This is an automated email from the ASF dual-hosted git repository.
sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
The following commit(s) were added to refs/heads/main by this push:
new c766e02 Archive member count limit #604
c766e02 is described below
commit c766e023d82a7bae41b34b0bed9dd08fbefac081
Author: Andrew K. Musselman <[email protected]>
AuthorDate: Tue Jan 27 19:55:20 2026 -0800
Archive member count limit #604
---
atr/archives.py | 71 +++++++++++++++++++++++------------------
atr/tarzip.py | 73 ++++++++++++++++++++++++++++++++++++++++---
atr/tasks/checks/targz.py | 6 ++--
atr/tasks/checks/zipformat.py | 30 +++++++++++-------
atr/util.py | 23 +++++---------
5 files changed, 137 insertions(+), 66 deletions(-)
diff --git a/atr/archives.py b/atr/archives.py
index 0f87d88..0ed6982 100644
--- a/atr/archives.py
+++ b/atr/archives.py
@@ -43,10 +43,19 @@ def extract(
try:
with tarzip.open_archive(archive_path) as archive:
match archive.specific():
- case tarfile.TarFile() as tf:
- for member in tf:
- total_extracted, extracted_paths =
_archive_extract_member(
- tf, member, extract_dir, total_extracted,
max_size, chunk_size, track_files, extracted_paths
+ case tarfile.TarFile():
+ for member in archive:
+ if not isinstance(member, tarzip.TarMember):
+ continue
+ total_extracted, extracted_paths =
_tar_archive_extract_member(
+ archive,
+ member,
+ extract_dir,
+ total_extracted,
+ max_size,
+ chunk_size,
+ track_files,
+ extracted_paths,
)
case zipfile.ZipFile():
@@ -64,7 +73,7 @@ def extract(
extracted_paths,
)
- except (tarfile.TarError, zipfile.BadZipFile, ValueError) as e:
+ except (tarfile.TarError, zipfile.BadZipFile, ValueError,
tarzip.ArchiveMemberLimitExceededError) as e:
raise ExtractionError(f"Failed to read archive: {e}", {"archive_path":
archive_path}) from e
return total_extracted, extracted_paths
@@ -73,8 +82,8 @@ def extract(
def total_size(tgz_path: str, chunk_size: int = 4096) -> int:
with tarzip.open_archive(tgz_path) as archive:
match archive.specific():
- case tarfile.TarFile() as tf:
- total_size = _size_tar(tf, chunk_size)
+ case tarfile.TarFile():
+ total_size = _size_tar(archive, chunk_size)
case zipfile.ZipFile():
total_size = _size_zip(archive, chunk_size)
@@ -82,9 +91,9 @@ def total_size(tgz_path: str, chunk_size: int = 4096) -> int:
return total_size
-def _archive_extract_member( # noqa: C901
- tf: tarfile.TarFile,
- member: tarfile.TarInfo,
+def _tar_archive_extract_member( # noqa: C901
+ archive: tarzip.Archive,
+ member: tarzip.TarMember,
extract_dir: str,
total_extracted: int,
max_size: int,
@@ -107,7 +116,7 @@ def _archive_extract_member( # noqa: C901
extracted_paths.append(member.name)
# Check whether extraction would exceed the size limit
- if member.isreg() and ((total_extracted + member.size) > max_size):
+ if member.isfile() and ((total_extracted + member.size) > max_size):
raise ExtractionError(
f"Extraction would exceed maximum size limit of {max_size} bytes",
{"max_size": max_size, "current_size": total_extracted,
"file_size": member.size},
@@ -119,32 +128,32 @@ def _archive_extract_member( # noqa: C901
if _safe_path(extract_dir, member.name) is None:
log.warning(f"Skipping potentially unsafe path: {member.name}")
return 0, extracted_paths
- tf.extract(member, extract_dir, numeric_owner=True,
filter="fully_trusted")
+ archive.extract_member(member, extract_dir, numeric_owner=True,
tar_filter="fully_trusted")
- elif member.isreg():
- extracted_size = _archive_extract_safe_process_file(
- tf, member, extract_dir, total_extracted, max_size, chunk_size
+ elif member.isfile():
+ extracted_size = _tar_archive_extract_safe_process_file(
+ archive, member, extract_dir, total_extracted, max_size, chunk_size
)
total_extracted += extracted_size
elif member.issym():
- _archive_extract_safe_process_symlink(member, extract_dir)
+ _tar_archive_extract_safe_process_symlink(member, extract_dir)
elif member.islnk():
- _archive_extract_safe_process_hardlink(member, extract_dir)
+ _tar_archive_extract_safe_process_hardlink(member, extract_dir)
return total_extracted, extracted_paths
-def _archive_extract_safe_process_file(
- tf: tarfile.TarFile,
- member: tarfile.TarInfo,
+def _tar_archive_extract_safe_process_file(
+ archive: tarzip.Archive,
+ member: tarzip.TarMember,
extract_dir: str,
total_extracted: int,
max_size: int,
chunk_size: int,
) -> int:
- """Process a single file member during safe archive extraction."""
+ """Process a single file member during safe tar archive extraction."""
target_path = _safe_path(extract_dir, member.name)
if target_path is None:
log.warning(f"Skipping potentially unsafe path: {member.name}")
@@ -152,9 +161,9 @@ def _archive_extract_safe_process_file(
os.makedirs(os.path.dirname(target_path), exist_ok=True)
- source = tf.extractfile(member)
+ source = archive.extractfile(member)
if source is None:
- # Should not happen if member.isreg() is true
+ # Should not happen if member.isfile() is true
log.warning(f"Could not extract file object for member: {member.name}")
return 0
@@ -180,8 +189,8 @@ def _archive_extract_safe_process_file(
return extracted_file_size
-def _archive_extract_safe_process_hardlink(member: tarfile.TarInfo,
extract_dir: str) -> None:
- """Safely create a hard link from the TarInfo entry."""
+def _tar_archive_extract_safe_process_hardlink(member: tarzip.TarMember,
extract_dir: str) -> None:
+ """Safely create a hard link from the TarMember entry."""
target_path = _safe_path(extract_dir, member.name)
if target_path is None:
log.warning(f"Skipping potentially unsafe hard link path:
{member.name}")
@@ -203,8 +212,8 @@ def _archive_extract_safe_process_hardlink(member:
tarfile.TarInfo, extract_dir:
log.warning(f"Failed to create hard link {target_path} ->
{source_path}: {e}")
-def _archive_extract_safe_process_symlink(member: tarfile.TarInfo,
extract_dir: str) -> None:
- """Safely create a symbolic link from the TarInfo entry."""
+def _tar_archive_extract_safe_process_symlink(member: tarzip.TarMember,
extract_dir: str) -> None:
+ """Safely create a symbolic link from the TarMember entry."""
target_path = _safe_path(extract_dir, member.name)
if target_path is None:
log.warning(f"Skipping potentially unsafe symlink path: {member.name}")
@@ -242,12 +251,14 @@ def _safe_path(base_dir: str, *paths: str) -> str | None:
return None
-def _size_tar(tf: tarfile.TarFile, chunk_size: int) -> int:
+def _size_tar(archive: tarzip.Archive, chunk_size: int) -> int:
total_size = 0
- for member in tf:
+ for member in archive:
+ if not isinstance(member, tarzip.TarMember):
+ continue
total_size += member.size
if member.isfile():
- fileobj = tf.extractfile(member)
+ fileobj = archive.extractfile(member)
if fileobj is not None:
while fileobj.read(chunk_size):
pass
diff --git a/atr/tarzip.py b/atr/tarzip.py
index 391b1cb..7ad6a66 100644
--- a/atr/tarzip.py
+++ b/atr/tarzip.py
@@ -19,9 +19,16 @@ import contextlib
import tarfile
import zipfile
from collections.abc import Generator, Iterator
-from typing import IO, TypeVar
+from typing import IO, Final, Literal, TypeVar
from typing import Protocol as TypingProtocol
+MAX_ARCHIVE_MEMBERS: Final[int] = 100_000
+
+
+class ArchiveMemberLimitExceededError(Exception):
+ pass
+
+
ArchiveT = TypeVar("ArchiveT", tarfile.TarFile, zipfile.ZipFile)
# If you set this covariant=True, then mypy says an "invariant one is expected"
# But pyright warns, "should be covariant" if you don't
@@ -96,19 +103,32 @@ type Member = TarMember | ZipMember
class ArchiveContext[ArchiveT: (tarfile.TarFile, zipfile.ZipFile)]:
_archive_obj: ArchiveT
+ _max_members: int
- def __init__(self, archive_obj: ArchiveT):
+ def __init__(self, archive_obj: ArchiveT, max_members: int =
MAX_ARCHIVE_MEMBERS):
self._archive_obj = archive_obj
+ self._max_members = max_members
def __iter__(self) -> Iterator[TarMember | ZipMember]:
+ count = 0
match self._archive_obj:
case tarfile.TarFile() as tf:
for member_orig in tf:
if member_orig.isdev():
continue
+ count += 1
+ if count > self._max_members:
+ raise ArchiveMemberLimitExceededError(
+ f"Archive contains too many members: exceeded
limit of {self._max_members}"
+ )
yield TarMember(member_orig)
case zipfile.ZipFile() as zf:
for member_orig in zf.infolist():
+ count += 1
+ if count > self._max_members:
+ raise ArchiveMemberLimitExceededError(
+ f"Archive contains too many members: exceeded
limit of {self._max_members}"
+ )
yield ZipMember(member_orig)
def extractfile(self, member_wrapper: Member) -> IO[bytes] | None:
@@ -125,6 +145,34 @@ class ArchiveContext[ArchiveT: (tarfile.TarFile,
zipfile.ZipFile)]:
except (KeyError, AttributeError, Exception):
return None
+ def extract_member(
+ self,
+ member_wrapper: Member,
+ path: str,
+ *,
+ numeric_owner: bool = True,
+ tar_filter: Literal["fully_trusted", "tar", "data"] | None =
"fully_trusted",
+ ) -> None:
+ """Extract a single member to the specified path.
+
+ For tar archives, this uses the tarfile extract method with the
specified filter.
+ For zip archives, this is a no-op as zip extraction is handled
differently
+ (directories are created via os.makedirs in the calling code).
+ """
+ match self._archive_obj:
+ case tarfile.TarFile() as tf:
+ if not isinstance(member_wrapper, TarMember):
+ raise TypeError("Archive is TarFile, but member_wrapper is
not TarMember")
+ tf.extract(
+ member_wrapper._original_info,
+ path,
+ numeric_owner=numeric_owner,
+ filter=tar_filter,
+ )
+ case zipfile.ZipFile():
+ # Zip extraction is handled differently in the calling code
+ pass
+
def specific(self) -> tarfile.TarFile | zipfile.ZipFile:
return self._archive_obj
@@ -135,7 +183,22 @@ type Archive = TarArchive | ZipArchive
@contextlib.contextmanager
-def open_archive(archive_path: str) -> Generator[Archive]:
+def open_archive(
+ archive_path: str,
+ max_members: int = MAX_ARCHIVE_MEMBERS,
+) -> Generator[Archive]:
+ """Open an archive file (tar or zip) and yield an ArchiveContext.
+
+ Args:
+ archive_path: Path to the archive file.
+ max_members: Maximum number of members allowed in the archive.
+ Defaults to MAX_ARCHIVE_MEMBERS (100,000).
+ Set to 0 or negative to disable the limit.
+
+ Raises:
+ ValueError: If the archive format is unsupported or corrupted.
+ ArchiveMemberLimitExceededError: If the archive exceeds max_members
during iteration.
+ """
archive_file: tarfile.TarFile | zipfile.ZipFile | None = None
try:
try:
@@ -148,9 +211,9 @@ def open_archive(archive_path: str) -> Generator[Archive]:
match archive_file:
case tarfile.TarFile() as tf_concrete:
- yield ArchiveContext[tarfile.TarFile](tf_concrete)
+ yield ArchiveContext[tarfile.TarFile](tf_concrete, max_members)
case zipfile.ZipFile() as zf_concrete:
- yield ArchiveContext[zipfile.ZipFile](zf_concrete)
+ yield ArchiveContext[zipfile.ZipFile](zf_concrete, max_members)
finally:
if archive_file:
diff --git a/atr/tasks/checks/targz.py b/atr/tasks/checks/targz.py
index c3302e9..18ae3f4 100644
--- a/atr/tasks/checks/targz.py
+++ b/atr/tasks/checks/targz.py
@@ -16,12 +16,12 @@
# under the License.
import asyncio
-import tarfile
from typing import Final
import atr.archives as archives
import atr.log as log
import atr.models.results as results
+import atr.tarzip as tarzip
import atr.tasks.checks as checks
@@ -52,8 +52,8 @@ def root_directory(tgz_path: str) -> str:
"""Find the root directory in a tar archive and validate that it has only
one root dir."""
root = None
- with tarfile.open(tgz_path, mode="r|gz") as tf:
- for member in tf:
+ with tarzip.open_archive(tgz_path) as archive:
+ for member in archive:
if member.name and member.name.split("/")[-1].startswith("._"):
# Metadata convention
continue
diff --git a/atr/tasks/checks/zipformat.py b/atr/tasks/checks/zipformat.py
index cdce89e..d69138f 100644
--- a/atr/tasks/checks/zipformat.py
+++ b/atr/tasks/checks/zipformat.py
@@ -22,6 +22,7 @@ from typing import Any
import atr.log as log
import atr.models.results as results
+import atr.tarzip as tarzip
import atr.tasks.checks as checks
import atr.util as util
@@ -76,11 +77,13 @@ async def structure(args: checks.FunctionArguments) ->
results.Results | None:
def _integrity_check_core_logic(artifact_path: str) -> dict[str, Any]:
"""Verify that a zip file can be opened and its members listed."""
try:
- with zipfile.ZipFile(artifact_path, "r") as zf:
+ with tarzip.open_archive(artifact_path) as archive:
# This is a simple check using list members
# We can use zf.testzip() for CRC checks if needed, though this
will be slower
- member_list = zf.namelist()
- return {"member_count": len(member_list)}
+ member_count = sum(1 for _ in archive)
+ return {"member_count": member_count}
+ except tarzip.ArchiveMemberLimitExceededError as e:
+ return {"error": f"Archive has too many members: {e}"}
except zipfile.BadZipFile as e:
return {"error": f"Bad zip file: {e}"}
except FileNotFoundError:
@@ -92,8 +95,8 @@ def _integrity_check_core_logic(artifact_path: str) ->
dict[str, Any]:
def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]:
"""Verify the internal structure of the zip archive."""
try:
- with zipfile.ZipFile(artifact_path, "r") as zf:
- members = zf.namelist()
+ with tarzip.open_archive(artifact_path) as archive:
+ members: list[tarzip.Member] = list(archive)
if not members:
return {"error": "Archive is empty"}
@@ -107,9 +110,10 @@ def _structure_check_core_logic(artifact_path: str) ->
dict[str, Any]:
# name_part = "-".join(name_part.split("-")[:-1])
expected_root = name_part
- root_dirs, non_rooted_files =
_structure_check_core_logic_find_roots(zf, members)
+ root_dirs, non_rooted_files =
_structure_check_core_logic_find_roots(members)
+ member_names = [m.name for m in members]
actual_root, error_msg = _structure_check_core_logic_validate_root(
- members, root_dirs, non_rooted_files, expected_root
+ member_names, root_dirs, non_rooted_files, expected_root
)
if error_msg:
@@ -121,6 +125,8 @@ def _structure_check_core_logic(artifact_path: str) ->
dict[str, Any]:
return {"root_dir": actual_root}
return {"error": "Unknown structure validation error"}
+ except tarzip.ArchiveMemberLimitExceededError as e:
+ return {"error": f"Archive has too many members: {e}"}
except zipfile.BadZipFile as e:
return {"error": f"Bad zip file: {e}"}
except FileNotFoundError:
@@ -129,15 +135,15 @@ def _structure_check_core_logic(artifact_path: str) ->
dict[str, Any]:
return {"error": f"Unexpected error: {e}"}
-def _structure_check_core_logic_find_roots(zf: zipfile.ZipFile, members:
list[str]) -> tuple[set[str], list[str]]:
+def _structure_check_core_logic_find_roots(members: list[tarzip.Member]) ->
tuple[set[str], list[str]]:
"""Identify root directories and non-rooted files in a zip archive."""
root_dirs: set[str] = set()
non_rooted_files: list[str] = []
for member in members:
- if "/" in member:
- root_dirs.add(member.split("/", 1)[0])
- elif not zipfile.Path(zf, member).is_dir():
- non_rooted_files.append(member)
+ if "/" in member.name:
+ root_dirs.add(member.name.split("/", 1)[0])
+ elif not member.isdir():
+ non_rooted_files.append(member.name)
return root_dirs, non_rooted_files
diff --git a/atr/util.py b/atr/util.py
index ab9b223..b736abb 100644
--- a/atr/util.py
+++ b/atr/util.py
@@ -51,6 +51,7 @@ import atr.ldap as ldap
import atr.log as log
import atr.models.sql as sql
import atr.registry as registry
+import atr.tarzip as tarzip
import atr.user as user
T = TypeVar("T")
@@ -96,26 +97,16 @@ async def archive_listing(file_path: pathlib.Path) ->
list[str] | None:
return None
with contextlib.suppress(Exception):
- if file_path.name.endswith((".tar.gz", ".tgz")):
+ if file_path.name.endswith((".tar.gz", ".tgz", ".zip")):
- def _read_tar() -> list[str] | None:
- with contextlib.suppress(tarfile.ReadError, EOFError,
ValueError, OSError):
- with tarfile.open(file_path, mode="r:*") as tf:
+ def _read_archive() -> list[str] | None:
+ with contextlib.suppress(tarfile.ReadError,
zipfile.BadZipFile, EOFError, ValueError, OSError):
+ with tarzip.open_archive(str(file_path)) as archive:
# TODO: Skip metadata files
- return sorted(tf.getnames())
+ return sorted(member.name for member in archive)
return None
- return await asyncio.to_thread(_read_tar)
-
- elif file_path.name.endswith(".zip"):
-
- def _read_zip() -> list[str] | None:
- with contextlib.suppress(zipfile.BadZipFile, EOFError,
ValueError, OSError):
- with zipfile.ZipFile(file_path, "r") as zf:
- return sorted(zf.namelist())
- return None
-
- return await asyncio.to_thread(_read_zip)
+ return await asyncio.to_thread(_read_archive)
return None
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]