This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9583eb787b GH-15047: [Python]: switch from pytz to zoneinfo by default
for string to tzinfo conversion (#49694)
9583eb787b is described below
commit 9583eb787b097a448ada9b13475dbb621fdcbc97
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Fri May 22 17:39:14 2026 +0200
GH-15047: [Python]: switch from pytz to zoneinfo by default for string to
tzinfo conversion (#49694)
### Rationale for this change
`zoneinfo` has been part of the standard library since Python 3.9, so we
can now assume that it is available and switch from returning `pytz`
timezones by default to returning `zoneinfo` timezones (or
`datetime.timezone` for fixed offsets).
We only keep `pytz` as a fallback for strings that are not supported by
`zoneinfo` but were supported by `pytz`. Later, we should maybe
deprecate that fallback.
Generally, we should move away from using `pytz`, since the core
functionality of having time zones is now available in the standard
library (`zoneinfo`), and because the pytz package has several warts /
incompatibilities with stdlib datetime
(https://blog.ganssle.io/articles/2018/03/pytz-fastest-footgun.html).
### What changes are included in this PR?
Whenever we create a Python timezone object, i.e. when converting to
pandas or to a `datetime.datetime` object:
- always prefer `zoneinfo` for `datetime.datetime` objects
- prefer `zoneinfo` for pandas objects _if_ pandas >= 3, to align with
the change on the pandas side
(https://github.com/pandas-dev/pandas/issues/34916)
In either case, when preferring `zoneinfo`, we still fall back to `pytz`
for named timezones if `zoneinfo` does not recognize the zone name
(pytz apparently accepts some common (older) aliases that do not
always work with zoneinfo).
This fallback is something we could deprecate and remove later on (so we
can eventually remove all usage of pytz).
### Are these changes tested?
Yes
### Are there any user-facing changes?
**This PR includes breaking changes to public APIs.**
We now return a different object (a different class, i.e.
`zoneinfo.ZoneInfo` instead of `pytz.tzinfo.BaseTzInfo`; both are
still subclasses of `datetime.tzinfo`). The two classes differ somewhat
in their API, so this is a breaking change for anyone relying on the
pytz-specific API.
For the conversion to pandas, pandas itself has already made this
breaking change (in pandas 3), so in those cases this PR simply aligns
with the pandas behavior.
* GitHub Issue: #15047
---
python/pyarrow/includes/libarrow_python.pxd | 2 +-
python/pyarrow/pandas_compat.py | 7 ++--
python/pyarrow/scalar.pxi | 11 +++++-
python/pyarrow/src/arrow/python/datetime.cc | 26 ++++++++++---
python/pyarrow/src/arrow/python/datetime.h | 2 +-
python/pyarrow/tests/test_pandas.py | 34 ++++++++++++----
python/pyarrow/tests/test_types.py | 60 ++++++++++++++++-------------
python/pyarrow/types.pxi | 12 ++++--
8 files changed, 104 insertions(+), 50 deletions(-)
diff --git a/python/pyarrow/includes/libarrow_python.pxd
b/python/pyarrow/includes/libarrow_python.pxd
index 72c278d3e7..385a2924d1 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -213,7 +213,7 @@ cdef extern from "arrow/python/api.h" namespace
"arrow::py::internal" nogil:
CTimePoint TimePoint_from_ns(int64_t val)
CResult[c_string] TzinfoToString(PyObject* pytzinfo)
- CResult[PyObject*] StringToTzinfo(c_string)
+ CResult[PyObject*] StringToTzinfo(c_string, c_bool)
cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index dfca59cbf5..d27a95b9f9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -787,7 +787,7 @@ def _reconstruct_block(item, columns=None,
extension_columns=None, return_block=
def make_datetimetz(unit, tz):
if _pandas_api.is_v1():
unit = 'ns' # ARROW-3789: Coerce date/timestamp types to
datetime64[ns]
- tz = pa.lib.string_to_tzinfo(tz)
+ tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3())
return _pandas_api.datetimetz_type(unit, tz=tz)
@@ -1183,7 +1183,8 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
# ARROW-13756: if index is timezone aware DataTimeIndex
elif pandas_dtype == "datetimetz":
tz = pa.lib.string_to_tzinfo(
- column_indexes[0]['metadata']['timezone'])
+ column_indexes[0]['metadata']['timezone'],
+ prefer_zoneinfo=_pandas_api.is_ge_v3())
level = pd.to_datetime(level, utc=True).tz_convert(tz)
if _pandas_api.is_ge_v3():
# with pandas 3+, to_datetime returns a unit depending on the
string
@@ -1289,7 +1290,7 @@ def make_tz_aware(series, tz):
"""
Make a datetime64 Series timezone-aware for the given tz
"""
- tz = pa.lib.string_to_tzinfo(tz)
+ tz = pa.lib.string_to_tzinfo(tz, prefer_zoneinfo=_pandas_api.is_ge_v3())
series = (series.dt.tz_localize('utc')
.dt.tz_convert(tz))
return series
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a6377b2bb7..fb7de926ed 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -822,7 +822,16 @@ cdef class TimestampScalar(Scalar):
return None
if not dtype.timezone().empty():
- tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
+ # for datetime.datetime output, always prefer zoneinfo over pytz
+ prefer_zoneinfo = True
+ if _pandas_api.have_pandas and dtype.unit() == TimeUnit_NANO:
+ # but if this method returns a pandas.Timestamp (i.e. pandas
installed
+ # and nano unit) -> adjust preference based on the pandas
version
+ # (i.e. keep returning pytz for older pandas)
+ prefer_zoneinfo = _pandas_api.is_ge_v3()
+ tzinfo = string_to_tzinfo(
+ frombytes(dtype.timezone()), prefer_zoneinfo=prefer_zoneinfo
+ )
else:
tzinfo = None
diff --git a/python/pyarrow/src/arrow/python/datetime.cc
b/python/pyarrow/src/arrow/python/datetime.cc
index 1c4e66064d..6a835c2d37 100644
--- a/python/pyarrow/src/arrow/python/datetime.cc
+++ b/python/pyarrow/src/arrow/python/datetime.cc
@@ -368,13 +368,14 @@ Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject*
pytzinfo) {
// Converted from python. See https://github.com/apache/arrow/pull/7604
// for details.
-Result<PyObject*> StringToTzinfo(const std::string& tz) {
+Result<PyObject*> StringToTzinfo(const std::string& tz, bool prefer_zoneinfo) {
std::string_view sign_str, hour_str, minute_str;
OwnedRef pytz;
OwnedRef zoneinfo;
OwnedRef datetime;
- if (internal::ImportModule("pytz", &pytz).ok()) {
+ // Legacy behavior: prefer pytz objects when available
+ if (!prefer_zoneinfo && internal::ImportModule("pytz", &pytz).ok()) {
if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
int sign = -1;
if (sign_str == "+") {
@@ -406,7 +407,7 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
return tzinfo;
}
- // catch fixed offset if pytz is not present
+ // Handle fixed offsets with datetime.timezone
if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
RETURN_NOT_OK(internal::ImportModule("datetime", &datetime));
int sign = -1;
@@ -447,7 +448,7 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
return tzinfo;
}
- // fallback on zoneinfo if tz is string and pytz is not present
+ // Use zoneinfo for named timezones when available
if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) {
OwnedRef class_zoneinfo;
RETURN_NOT_OK(
@@ -456,12 +457,25 @@ Result<PyObject*> StringToTzinfo(const std::string& tz) {
PyUnicode_FromStringAndSize(tz.c_str(),
static_cast<Py_ssize_t>(tz.size())));
auto tzinfo =
PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(),
NULL);
+ if (tzinfo != nullptr) {
+ return tzinfo;
+ }
+
+ // Keep backwards compatibility for named timezones only available in pytz
+ PyErr_Clear();
+ }
+
+ if (internal::ImportModule("pytz", &pytz).ok()) {
+ OwnedRef timezone;
+ RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone",
&timezone));
+ OwnedRef py_tz_string(
+ PyUnicode_FromStringAndSize(tz.c_str(),
static_cast<Py_ssize_t>(tz.size())));
+ auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(),
py_tz_string.obj(), NULL);
RETURN_IF_PYERROR();
return tzinfo;
}
- return Status::Invalid(
- "Pytz package or Python>=3.8 for zoneinfo module must be installed.");
+ return Status::Invalid("The zoneinfo module or pytz package must be
installed.");
}
Result<std::string> TzinfoToString(PyObject* tzinfo) {
diff --git a/python/pyarrow/src/arrow/python/datetime.h
b/python/pyarrow/src/arrow/python/datetime.h
index 9b21eeb434..84f46fe2d1 100644
--- a/python/pyarrow/src/arrow/python/datetime.h
+++ b/python/pyarrow/src/arrow/python/datetime.h
@@ -188,7 +188,7 @@ Result<int64_t> PyDateTime_utcoffset_s(PyObject*
pydatetime);
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
/// GIL must be held when calling this method.
ARROW_PYTHON_EXPORT
-Result<PyObject*> StringToTzinfo(const std::string& tz);
+Result<PyObject*> StringToTzinfo(const std::string& tz, bool prefer_zoneinfo =
true);
/// \brief Convert a time zone object to a string representation.
///
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 090c55065e..4c8e6f8259 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -21,6 +21,7 @@ import json
import multiprocessing as mp
import sys
import warnings
+import zoneinfo
from collections import OrderedDict
from datetime import date, datetime, time, timedelta, timezone
@@ -1168,10 +1169,23 @@ class TestConvertDateTimeLikeTypes:
def test_python_datetime_with_pytz_tzinfo(self):
pytz = pytest.importorskip("pytz")
- for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
- values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
+ timezones_pytz = [pytz.utc, pytz.timezone('US/Eastern'),
pytz.FixedOffset(1)]
+ timezones_zoneinfo = [
+ zoneinfo.ZoneInfo('UTC'),
+ zoneinfo.ZoneInfo('US/Eastern'),
+ timezone(timedelta(minutes=1))
+ ]
+
+ for tz, tz_zoneinfo in zip(timezones_pytz, timezones_zoneinfo):
+ values = [tz.localize(datetime(2018, 1, 1, 12, 23, 45))]
df = pd.DataFrame({'datetime': values})
- _check_pandas_roundtrip(df)
+ if Version(pd.__version__) >= Version("3.0.0"):
+ df_expected = pd.DataFrame(
+ {'datetime': [datetime(2018, 1, 1, 12, 23, 45,
tzinfo=tz_zoneinfo)]}
+ )
+ else:
+ df_expected = None
+ _check_pandas_roundtrip(df, expected=df_expected)
@h.given(st.none() | past.timezones)
@h.settings(deadline=None)
@@ -1183,7 +1197,6 @@ class TestConvertDateTimeLikeTypes:
_check_pandas_roundtrip(df, check_dtype=False)
def test_python_datetime_with_timezone_tzinfo(self):
- pytz = pytest.importorskip("pytz")
from datetime import timezone
values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=timezone.utc)]
@@ -1191,14 +1204,19 @@ class TestConvertDateTimeLikeTypes:
df = pd.DataFrame({'datetime': values}, index=values)
_check_pandas_roundtrip(df, preserve_index=True)
- # datetime.timezone is going to be pytz.FixedOffset
hours = 1
tz_timezone = timezone(timedelta(hours=hours))
- tz_pytz = pytz.FixedOffset(hours * 60)
values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
- values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
df = pd.DataFrame({'datetime': values}, index=values)
- df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp)
+ if Version(pd.__version__) < Version("3.0.0"):
+ # datetime.timezone is going to be pytz.FixedOffset
+ pytz = pytest.importorskip("pytz")
+ tz_pytz = pytz.FixedOffset(hours * 60)
+ values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
+ df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp)
+ else:
+ df_exp = None
+
_check_pandas_roundtrip(df, expected=df_exp, preserve_index=True)
def test_python_datetime_subclass(self):
diff --git a/python/pyarrow/tests/test_types.py
b/python/pyarrow/tests/test_types.py
index 539f017245..4a33d79223 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -20,6 +20,7 @@ from collections.abc import Iterator, Mapping
from functools import partial
import datetime
import sys
+import zoneinfo
import pytest
import hypothesis as h
@@ -491,35 +492,40 @@ def test_convert_custom_tzinfo_objects_to_string():
def test_string_to_tzinfo():
string = ['UTC', 'Europe/Paris', '+03:00', '+01:30', '-02:00']
- try:
- import pytz
- expected = [pytz.utc, pytz.timezone('Europe/Paris'),
- pytz.FixedOffset(180), pytz.FixedOffset(90),
- pytz.FixedOffset(-120)]
- result = [pa.lib.string_to_tzinfo(i) for i in string]
- assert result == expected
-
- except ImportError:
- try:
- import zoneinfo
- expected = [zoneinfo.ZoneInfo(key='UTC'),
- zoneinfo.ZoneInfo(key='Europe/Paris'),
- datetime.timezone(datetime.timedelta(hours=3)),
- datetime.timezone(
- datetime.timedelta(hours=1, minutes=30)),
- datetime.timezone(-datetime.timedelta(hours=2))]
- result = [pa.lib.string_to_tzinfo(i) for i in string]
- assert result == expected
-
- except ImportError:
- pytest.skip('requires pytz or zoneinfo to be installed')
-
-
-def test_timezone_string_roundtrip_pytz():
+ result = [pa.lib.string_to_tzinfo(i) for i in string]
+ expected = [
+ zoneinfo.ZoneInfo('UTC'),
+ zoneinfo.ZoneInfo('Europe/Paris'),
+ datetime.timezone(datetime.timedelta(hours=3)),
+ datetime.timezone(datetime.timedelta(hours=1, minutes=30)),
+ datetime.timezone(-datetime.timedelta(hours=2)),
+ ]
+ assert result == expected
+
+
+def test_string_to_tzinfo_prefer_zoneinfo_false():
pytz = pytest.importorskip("pytz")
+ result = pa.lib.string_to_tzinfo("Europe/Brussels", prefer_zoneinfo=False)
+ assert result == pytz.timezone("Europe/Brussels")
+ result = pa.lib.string_to_tzinfo("+01:30", prefer_zoneinfo=False)
+ assert result == pytz.FixedOffset(90)
+
+
[email protected](
+ sys.platform == 'darwin', reason="macOS supports those lower-case names"
+)
+def test_string_to_tzinfo_pytz_fallback():
+ pytz = pytest.importorskip("pytz")
+ result = pa.lib.string_to_tzinfo("europe/brussels")
+ expected = pytz.timezone("Europe/Brussels")
+ assert result == expected
+
- tz = [pytz.FixedOffset(90), pytz.FixedOffset(-90),
- pytz.utc, pytz.timezone('America/New_York')]
+def test_timezone_string_roundtrip():
+ tz = [datetime.timezone(datetime.timedelta(hours=1, minutes=30)),
+ datetime.timezone(datetime.timedelta(hours=-1, minutes=-30)),
+ zoneinfo.ZoneInfo('UTC'),
+ zoneinfo.ZoneInfo('America/New_York')]
name = ['+01:30', '-01:30', 'UTC', 'America/New_York']
assert [pa.lib.tzinfo_to_string(i) for i in tz] == name
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index e9eef89651..ec1a5a2ba9 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -4166,7 +4166,7 @@ def tzinfo_to_string(tz):
return frombytes(GetResultValue(TzinfoToString(<PyObject*>tz)))
-def string_to_tzinfo(name):
+def string_to_tzinfo(name, *, prefer_zoneinfo=True):
"""
Convert a time zone name into a time zone object.
@@ -4177,15 +4177,21 @@ def string_to_tzinfo(name):
Parameters
----------
- name: str
+ name: str
Time zone name.
+ prefer_zoneinfo : bool, default True
+ If True, resolve named timezones using ``zoneinfo`` first and only
+ fall back to ``pytz`` when needed. If False, prefer ``pytz`` when it
+ is available.
Returns
-------
tz : datetime.tzinfo
Time zone object
"""
- cdef PyObject* tz = GetResultValue(StringToTzinfo(name.encode('utf-8')))
+ cdef PyObject* tz = GetResultValue(
+ StringToTzinfo(name.encode('utf-8'), prefer_zoneinfo)
+ )
return PyObject_to_object(tz)