[Python-checkins] gh-118107: Fix zipimporter ZIP64 handling. (GH-118108)

encukou Tue, 07 May 2024 00:23:39 -0700

https://github.com/python/cpython/commit/49258efada0cb0fc58ccffc018ff310b8f7f4570
commit: 49258efada0cb0fc58ccffc018ff310b8f7f4570
branch: main
author: John Sirois <[email protected]>
committer: encukou <[email protected]>
date: 2024-05-07T09:23:27+02:00
summary:


gh-118107: Fix zipimporter ZIP64 handling. (GH-118108)

Add missing import to code that handles too large files and offsets.
Use list, not tuple, for a mutable sequence.

Add tests to prevent similar mistakes.

---------

Co-authored-by: Gregory P. Smith [Google LLC] <[email protected]>
Co-authored-by: Kirill Podoprigora <[email protected]>

files:
A Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part
A Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part
A Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part
A Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst
M Lib/test/test_zipimport.py
M Lib/zipimport.py
M Makefile.pre.in

diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py
index ae49700294330c..e9c3218d2bb39e 100644
--- a/Lib/test/test_zipimport.py
+++ b/Lib/test/test_zipimport.py
@@ -1,8 +1,10 @@
 import sys
 import os
 import marshal
+import glob
 import importlib
 import importlib.util
+import re
 import struct
 import time
 import unittest
@@ -54,6 +56,7 @@ def module_path_to_dotted_name(path):
 TESTPACK2 = "ziptestpackage2"
 TEMP_DIR = os.path.abspath("junk95142")
 TEMP_ZIP = os.path.abspath("junk95142.zip")
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "zipimport_data")
 
 pyc_file = importlib.util.cache_from_source(TESTMOD + '.py')
 pyc_ext = '.pyc'
@@ -134,7 +137,9 @@ def getZip64Files(self):
 
     def doTest(self, expected_ext, files, *modules, **kw):
         self.makeZip(files, **kw)
+        self.doTestWithPreBuiltZip(expected_ext, *modules, **kw)
 
+    def doTestWithPreBuiltZip(self, expected_ext, *modules, **kw):
         sys.path.insert(0, TEMP_ZIP)
 
         mod = importlib.import_module(".".join(modules))
@@ -810,6 +815,122 @@ def testZip64CruftAndComment(self):
         files = self.getZip64Files()
         self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))
 
+    def testZip64LargeFile(self):
+        support.requires(
+            "largefile",
+            f"test generates files >{0xFFFFFFFF} bytes and takes a long time "
+            "to run"
+        )
+
+        # N.B.: We do a lot of gymnastics below in the ZIP_STORED case to save
+        # and reconstruct a sparse zip on systems that support sparse files.
+        # Instead of creating a ~8GB zip file mainly consisting of null bytes
+        # for every run of the test, we create the zip once and save off the
+        # non-null portions of the resulting file as data blobs with offsets
+        # that allow re-creating the zip file sparsely. This drops disk space
+        # usage to ~9KB for the ZIP_STORED case and drops that test time by ~2
+        # orders of magnitude. For the ZIP_DEFLATED case, however, we bite the
+        # bullet. The resulting zip file is ~8MB of non-null data; so the sparse
+        # trick doesn't work and would result in that full ~8MB zip data file
+        # being checked in to source control.
+        parts_glob = f"sparse-zip64-c{self.compression:d}-0x*.part"
+        full_parts_glob = os.path.join(TEST_DATA_DIR, parts_glob)
+        pre_built_zip_parts = glob.glob(full_parts_glob)
+
+        self.addCleanup(os_helper.unlink, TEMP_ZIP)
+        if not pre_built_zip_parts:
+            if self.compression != ZIP_STORED:
+                support.requires(
+                    "cpu",
+                    "test requires a lot of CPU for compression."
+                )
+            self.addCleanup(os_helper.unlink, os_helper.TESTFN)
+            with open(os_helper.TESTFN, "wb") as f:
+                f.write(b"data")
+                f.write(os.linesep.encode())
+                f.seek(0xffff_ffff, os.SEEK_CUR)
+                f.write(os.linesep.encode())
+            os.utime(os_helper.TESTFN, (0.0, 0.0))
+            with ZipFile(
+                TEMP_ZIP,
+                "w",
+                compression=self.compression,
+                strict_timestamps=False
+            ) as z:
+                z.write(os_helper.TESTFN, "data1")
+                z.writestr(
+                    ZipInfo("module.py", (1980, 1, 1, 0, 0, 0)), test_src
+                )
+                z.write(os_helper.TESTFN, "data2")
+
+            # This "works" but relies on the zip format having a non-empty
+            # final page due to the trailing central directory to wind up with
+            # the correct length file.
+            def make_sparse_zip_parts(name):
+                empty_page = b"\0" * 4096
+                with open(name, "rb") as f:
+                    part = None
+                    try:
+                        while True:
+                            offset = f.tell()
+                            data = f.read(len(empty_page))
+                            if not data:
+                                break
+                            if data != empty_page:
+                                if not part:
+                                    part_fullname = os.path.join(
+                                        TEST_DATA_DIR,
+                                        f"sparse-zip64-c{self.compression:d}-"
+                                        f"{offset:#011x}.part",
+                                    )
+                                    os.makedirs(
+                                        os.path.dirname(part_fullname),
+                                        exist_ok=True
+                                    )
+                                    part = open(part_fullname, "wb")
+                                    print("Created", part_fullname)
+                                part.write(data)
+                            else:
+                                if part:
+                                    part.close()
+                                part = None
+                    finally:
+                        if part:
+                            part.close()
+
+            if self.compression == ZIP_STORED:
+                print(f"Creating sparse parts to check in into {TEST_DATA_DIR}:")
+                make_sparse_zip_parts(TEMP_ZIP)
+
+        else:
+            def extract_offset(name):
+                if m := re.search(r"-(0x[0-9a-f]{9})\.part$", name):
+                    return int(m.group(1), base=16)
+                raise ValueError(f"{name=} does not fit expected pattern.")
+            offset_parts = [(extract_offset(n), n) for n in pre_built_zip_parts]
+            with open(TEMP_ZIP, "wb") as f:
+                for offset, part_fn in sorted(offset_parts):
+                    with open(part_fn, "rb") as part:
+                        f.seek(offset, os.SEEK_SET)
+                        f.write(part.read())
+            # Confirm that the reconstructed zip file works and looks right.
+            with ZipFile(TEMP_ZIP, "r") as z:
+                self.assertEqual(
+                    z.getinfo("module.py").date_time, (1980, 1, 1, 0, 0, 0)
+                )
+                self.assertEqual(
+                    z.read("module.py"), test_src.encode(),
+                    msg=f"Recreate {full_parts_glob}, unexpected contents."
+                )
+                def assertDataEntry(name):
+                    zinfo = z.getinfo(name)
+                    self.assertEqual(zinfo.date_time, (1980, 1, 1, 0, 0, 0))
+                    self.assertGreater(zinfo.file_size, 0xffff_ffff)
+                assertDataEntry("data1")
+                assertDataEntry("data2")
+
+        self.doTestWithPreBuiltZip(".py", "module")
+
 
 @support.requires_zlib()
 class CompressedZipImportTestCase(UncompressedZipImportTestCase):
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part
new file mode 100644
index 00000000000000..c6beae8e2552d6
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part differ
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part
new file mode 100644
index 00000000000000..74ab03b4648948
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part differ
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part
new file mode 100644
index 00000000000000..9769a404f675d4
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part differ
diff --git a/Lib/zipimport.py b/Lib/zipimport.py
index 21d2dca46f569b..4e41d109865e85 100644
--- a/Lib/zipimport.py
+++ b/Lib/zipimport.py
@@ -517,8 +517,9 @@ def _read_directory(archive):
                             num_extra_values = (len(extra_data) - 4) // 8
                             if num_extra_values > 3:
                                 raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
-                            values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
-                                                        extra_data, offset=4)
+                            import struct
+                            values = list(struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
+                                                             extra_data, offset=4))
 
                             # N.b. Here be dragons: the ordering of these is different than
                             # the header fields, and it's really easy to get it wrong since
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 88ec2009fbfffe..74a438b015a97d 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -2483,7 +2483,8 @@ TESTSUBDIRS=      idlelib/idle_test \
                test/typinganndata \
                test/wheeldata \
                test/xmltestdata \
-               test/xmltestdata/c14n-20
+               test/xmltestdata/c14n-20 \
+               test/zipimport_data
 
 COMPILEALL_OPTS=-j0
 
diff --git a/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst b/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst
new file mode 100644
index 00000000000000..0e358d6605e66e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst
@@ -0,0 +1,2 @@
+Fix :mod:`zipimport` reading of ZIP64 files with file entries that are too big or
+offset too far.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

[Python-checkins] gh-118107: Fix zipimporter ZIP64 handling. (GH-118108)

Reply via email to