https://github.com/python/cpython/commit/1298511b41ec0f9be925c12f3830e94fe8f7e7dc
commit: 1298511b41ec0f9be925c12f3830e94fe8f7e7dc
branch: main
author: Tim Hatch <timha...@netflix.com>
committer: gpshead <g...@krypto.org>
date: 2025-05-20T18:32:41-07:00
summary:

gh-72680: Fix false positives when using zipfile.is_zipfile() (GH-134250)

bpo-28494: Improve zipfile.is_zipfile reliability

The zipfile.is_zipfile function would only search for the EndOfZipfile
section header. This failed to correctly identify non-zipfiles that
contained this header. Now the zipfile.is_zipfile function verifies
the first central directory entry.

Changes:
* Extended zipfile.is_zipfile to verify zipfile catalog
* Added tests to validate failure of binary non-zipfiles
* Reuse 'concat' handling for is_zipfile

Co-authored-by: John Jolly <john.jo...@gmail.com>

files:
A Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst
M Lib/test/test_zipfile/test_core.py
M Lib/zipfile/__init__.py

diff --git a/Lib/test/test_zipfile/test_core.py 
b/Lib/test/test_zipfile/test_core.py
index 43056978848c03..e93603998f979e 100644
--- a/Lib/test/test_zipfile/test_core.py
+++ b/Lib/test/test_zipfile/test_core.py
@@ -1991,6 +1991,25 @@ def test_is_zip_erroneous_file(self):
         self.assertFalse(zipfile.is_zipfile(fp))
         fp.seek(0, 0)
         self.assertFalse(zipfile.is_zipfile(fp))
+        # - passing non-zipfile with ZIP header elements
+        # data created using pyPNG like so:
+        #  d = [(ord('P'), ord('K'), 5, 6), (ord('P'), ord('K'), 6, 6)]
+        #  w = png.Writer(1,2,alpha=True,compression=0)
+        #  f = open('onepix.png', 'wb')
+        #  w.write(f, d)
+        #  w.close()
+        data = (b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00"
+                b"\x00\x02\x08\x06\x00\x00\x00\x99\x81\xb6'\x00\x00\x00\x15I"
+                b"DATx\x01\x01\n\x00\xf5\xff\x00PK\x05\x06\x00PK\x06\x06\x07"
+                b"\xac\x01N\xc6|a\r\x00\x00\x00\x00IEND\xaeB`\x82")
+        # - passing a filename
+        with open(TESTFN, "wb") as fp:
+            fp.write(data)
+        self.assertFalse(zipfile.is_zipfile(TESTFN))
+        # - passing a file-like object
+        fp = io.BytesIO()
+        fp.write(data)
+        self.assertFalse(zipfile.is_zipfile(fp))
 
     def test_damaged_zipfile(self):
         """Check that zipfiles with missing bytes at the end raise 
BadZipFile."""
diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py
index 894b4d37233923..18caeb3e04a2b5 100644
--- a/Lib/zipfile/__init__.py
+++ b/Lib/zipfile/__init__.py
@@ -234,8 +234,19 @@ def strip(cls, data, xids):
 
 def _check_zipfile(fp):
     try:
-        if _EndRecData(fp):
-            return True         # file has correct magic number
+        endrec = _EndRecData(fp)
+        if endrec:
+            if endrec[_ECD_ENTRIES_TOTAL] == 0 and endrec[_ECD_SIZE] == 0 and 
endrec[_ECD_OFFSET] == 0:
+                return True     # Empty zipfiles are still zipfiles
+            elif endrec[_ECD_DISK_NUMBER] == endrec[_ECD_DISK_START]:
+                # Central directory is on the same disk
+                fp.seek(sum(_handle_prepended_data(endrec)))
+                if endrec[_ECD_SIZE] >= sizeCentralDir:
+                    data = fp.read(sizeCentralDir)   # CD is where we expect 
it to be
+                    if len(data) == sizeCentralDir:
+                        centdir = struct.unpack(structCentralDir, data) # CD 
is the right size
+                        if centdir[_CD_SIGNATURE] == stringCentralDir:
+                            return True         # First central directory 
entry  has correct magic number
     except OSError:
         pass
     return False
@@ -258,6 +269,22 @@ def is_zipfile(filename):
         pass
     return result
 
+def _handle_prepended_data(endrec, debug=0):
+    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
+    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
+
+    # "concat" is zero, unless zip was concatenated to another file
+    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
+    if endrec[_ECD_SIGNATURE] == stringEndArchive64:
+        # If Zip64 extension structures are present, account for them
+        concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
+
+    if debug > 2:
+        inferred = concat + offset_cd
+        print("given, inferred, offset", offset_cd, inferred, concat)
+
+    return offset_cd, concat
+
 def _EndRecData64(fpin, offset, endrec):
     """
     Read the ZIP64 end-of-archive records and use that to update endrec
@@ -1501,28 +1528,21 @@ def _RealGetContents(self):
             raise BadZipFile("File is not a zip file")
         if self.debug > 1:
             print(endrec)
-        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
-        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
         self._comment = endrec[_ECD_COMMENT]    # archive comment
 
-        # "concat" is zero, unless zip was concatenated to another file
-        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
-        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
-            # If Zip64 extension structures are present, account for them
-            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
+        offset_cd, concat = _handle_prepended_data(endrec, self.debug)
+
+        # self.start_dir:  Position of start of central directory
+        self.start_dir = offset_cd + concat
 
         # store the offset to the beginning of data for the
         # .data_offset property
         self._data_offset = concat
 
-        if self.debug > 2:
-            inferred = concat + offset_cd
-            print("given, inferred, offset", offset_cd, inferred, concat)
-        # self.start_dir:  Position of start of central directory
-        self.start_dir = offset_cd + concat
         if self.start_dir < 0:
             raise BadZipFile("Bad offset for central directory")
         fp.seek(self.start_dir, 0)
+        size_cd = endrec[_ECD_SIZE]
         data = fp.read(size_cd)
         fp = io.BytesIO(data)
         total = 0
diff --git a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst 
b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst
new file mode 100644
index 00000000000000..0c518983770d5d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst
@@ -0,0 +1 @@
+Improve Zip file validation false positive rate in :func:`zipfile.is_zipfile`.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com

Reply via email to