This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new e52017a727 GH-37328: [Python] Add a function to download and extract 
timezone database on Windows (#38179)
e52017a727 is described below

commit e52017a72735d502c3ac3323d9d1fc61a15a6ae0
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Mar 20 08:59:14 2024 +0100

    GH-37328: [Python] Add a function to download and extract timezone database 
on Windows (#38179)
    
    ### Rationale for this change
    
    There is a section in the [Arrow C++ documentation with the 
instructions](https://arrow.apache.org/docs/dev/cpp/build_system.html#runtime-dependencies)
 on how to download and extract the text version of the IANA timezone database 
on Windows. We should provide a function in PyArrow that a user would call to 
download and extract the timezone database from Python.
    
    ### What changes are included in this PR?
    
    Function `download_tzdata_on_windows()` added to python/pyarrow/util.py 
that downloads and extracts timezone database to a standard location in 
`%USERPROFILE%\Downloads\tzdata` on Windows.
    
    ### Are these changes tested?
    Yes.
    
    ### Are there any user-facing changes?
    No.
    * Closes: #37328
    
    Lead-authored-by: AlenkaF <[email protected]>
    Co-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 docs/source/python/install.rst    |  3 ++-
 python/pyarrow/tests/test_util.py | 22 +++++++++++++++++++++-
 python/pyarrow/util.py            | 28 ++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst
index 4555977ece..4b966e6d26 100644
--- a/docs/source/python/install.rst
+++ b/docs/source/python/install.rst
@@ -82,7 +82,8 @@ tzdata on Windows
 While Arrow uses the OS-provided timezone database on Linux and macOS, it 
requires a
 user-provided database on Windows. To download and extract the text version of
 the IANA timezone database follow the instructions in the C++
-:ref:`download-timezone-database`.
+:ref:`download-timezone-database` or use pyarrow utility function
+`pyarrow.util.download_tzdata_on_windows()` that does the same.
 
 By default, the timezone database will be detected at 
``%USERPROFILE%\Downloads\tzdata``.
 If the database has been downloaded in a different location, you will need to 
set
diff --git a/python/pyarrow/tests/test_util.py 
b/python/pyarrow/tests/test_util.py
index 9fccb76112..e584b04111 100644
--- a/python/pyarrow/tests/test_util.py
+++ b/python/pyarrow/tests/test_util.py
@@ -16,14 +16,17 @@
 # under the License.
 
 import gc
+import os
 import signal
+import shutil
 import sys
 import textwrap
 import weakref
 
 import pytest
 
-from pyarrow.util import doc, _break_traceback_cycle_from_frame
+from pyarrow.util import (doc, _break_traceback_cycle_from_frame,
+                          download_tzdata_on_windows)
 from pyarrow.tests.util import disabled_gc
 
 
@@ -207,3 +210,20 @@ def test_signal_refcycle():
         assert wr() is not None
         _break_traceback_cycle_from_frame(sys._getframe(0))
         assert wr() is None
+
+
+@pytest.mark.skipif(sys.platform != "win32",
+                    reason="Timezone database is already provided.")
+def test_download_tzdata_on_windows():
+    tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
+
+    # Download timezone database and remove data in case it already exists
+    if (os.path.exists(tzdata_path)):
+        shutil.rmtree(tzdata_path)
+    download_tzdata_on_windows()
+
+    # Inspect the folder
+    assert os.path.exists(tzdata_path)
+    assert os.path.exists(os.path.join(tzdata_path, "windowsZones.xml"))
+    assert os.path.exists(os.path.join(tzdata_path, "europe"))
+    assert 'version' in os.listdir(tzdata_path)
diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py
index bb693cd663..89780da10f 100644
--- a/python/pyarrow/util.py
+++ b/python/pyarrow/util.py
@@ -228,3 +228,31 @@ def _break_traceback_cycle_from_frame(frame):
         # us visit the outer frame).
         refs = gc.get_referrers(frame)
     refs = frame = this_frame = None
+
+
+def download_tzdata_on_windows():
+    r"""
+    Download and extract latest IANA timezone database into the
+    location expected by Arrow which is %USERPROFILE%\Downloads\tzdata.
+    """
+    if sys.platform != 'win32':
+        raise TypeError(f"Timezone database is already provided by 
{sys.platform}")
+
+    import tarfile
+
+    tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
+    tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz")
+    os.makedirs(tzdata_path, exist_ok=True)
+
+    from urllib.request import urlopen
+    with urlopen('https://data.iana.org/time-zones/tzdata-latest.tar.gz') as 
response:
+        with open(tzdata_compressed, 'wb') as f:
+            f.write(response.read())
+
+    assert os.path.exists(tzdata_compressed)
+
+    tarfile.open(tzdata_compressed).extractall(tzdata_path)
+
+    with 
urlopen('https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml')
 as response_zones:   # noqa
+        with open(os.path.join(tzdata_path, "windowsZones.xml"), 'wb') as f:
+            f.write(response_zones.read())

Reply via email to