This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e52017a727 GH-37328: [Python] Add a function to download and extract
timezone database on Windows (#38179)
e52017a727 is described below
commit e52017a72735d502c3ac3323d9d1fc61a15a6ae0
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Mar 20 08:59:14 2024 +0100
GH-37328: [Python] Add a function to download and extract timezone database
on Windows (#38179)
### Rationale for this change
There is a section in the [Arrow C++ documentation with the
instructions](https://arrow.apache.org/docs/dev/cpp/build_system.html#runtime-dependencies)
on how to download and extract the text version of the IANA timezone database
on Windows. We should provide a function in PyArrow that a user would call to
download and extract the timezone database from Python.
### What changes are included in this PR?
Function `download_tzdata_on_windows()` added to python/pyarrow/util.py
that downloads and extracts the timezone database to the standard location
`%USERPROFILE%\Downloads\tzdata` on Windows.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* Closes: #37328
Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
docs/source/python/install.rst | 3 ++-
python/pyarrow/tests/test_util.py | 22 +++++++++++++++++++++-
python/pyarrow/util.py | 28 ++++++++++++++++++++++++++++
3 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst
index 4555977ece..4b966e6d26 100644
--- a/docs/source/python/install.rst
+++ b/docs/source/python/install.rst
@@ -82,7 +82,8 @@ tzdata on Windows
While Arrow uses the OS-provided timezone database on Linux and macOS, it
requires a
user-provided database on Windows. To download and extract the text version of
the IANA timezone database follow the instructions in the C++
-:ref:`download-timezone-database`.
+:ref:`download-timezone-database` or use pyarrow utility function
+`pyarrow.util.download_tzdata_on_windows()` that does the same.
By default, the timezone database will be detected at
``%USERPROFILE%\Downloads\tzdata``.
If the database has been downloaded in a different location, you will need to
set
diff --git a/python/pyarrow/tests/test_util.py
b/python/pyarrow/tests/test_util.py
index 9fccb76112..e584b04111 100644
--- a/python/pyarrow/tests/test_util.py
+++ b/python/pyarrow/tests/test_util.py
@@ -16,14 +16,17 @@
# under the License.
import gc
+import os
import signal
+import shutil
import sys
import textwrap
import weakref
import pytest
-from pyarrow.util import doc, _break_traceback_cycle_from_frame
+from pyarrow.util import (doc, _break_traceback_cycle_from_frame,
+ download_tzdata_on_windows)
from pyarrow.tests.util import disabled_gc
@@ -207,3 +210,20 @@ def test_signal_refcycle():
assert wr() is not None
_break_traceback_cycle_from_frame(sys._getframe(0))
assert wr() is None
+
+
[email protected](sys.platform != "win32",
+ reason="Timezone database is already provided.")
+def test_download_tzdata_on_windows():
+ tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
+
+ # Download timezone database and remove data in case it already exists
+ if (os.path.exists(tzdata_path)):
+ shutil.rmtree(tzdata_path)
+ download_tzdata_on_windows()
+
+ # Inspect the folder
+ assert os.path.exists(tzdata_path)
+ assert os.path.exists(os.path.join(tzdata_path, "windowsZones.xml"))
+ assert os.path.exists(os.path.join(tzdata_path, "europe"))
+ assert 'version' in os.listdir(tzdata_path)
diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py
index bb693cd663..89780da10f 100644
--- a/python/pyarrow/util.py
+++ b/python/pyarrow/util.py
@@ -228,3 +228,31 @@ def _break_traceback_cycle_from_frame(frame):
# us visit the outer frame).
refs = gc.get_referrers(frame)
refs = frame = this_frame = None
+
+
+def download_tzdata_on_windows():
+ r"""
+ Download and extract latest IANA timezone database into the
+ location expected by Arrow which is %USERPROFILE%\Downloads\tzdata.
+ """
+ if sys.platform != 'win32':
+ raise TypeError(f"Timezone database is already provided by
{sys.platform}")
+
+ import tarfile
+
+ tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
+ tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz")
+ os.makedirs(tzdata_path, exist_ok=True)
+
+ from urllib.request import urlopen
+ with urlopen('https://data.iana.org/time-zones/tzdata-latest.tar.gz') as
response:
+ with open(tzdata_compressed, 'wb') as f:
+ f.write(response.read())
+
+ assert os.path.exists(tzdata_compressed)
+
+ tarfile.open(tzdata_compressed).extractall(tzdata_path)
+
+ with
urlopen('https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml')
as response_zones: # noqa
+ with open(os.path.join(tzdata_path, "windowsZones.xml"), 'wb') as f:
+ f.write(response_zones.read())