This is an automated email from the ASF dual-hosted git repository. sbp pushed a commit to branch sbp in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
commit b280bcf1ee3df6de3c78a949eae2379fa7e5c34a Author: Sean B. Palmer <[email protected]> AuthorDate: Tue Mar 10 18:58:56 2026 +0000 Check .tar.gz structure using the extracted files only --- atr/tasks/checks/__init__.py | 17 +++ atr/tasks/checks/targz.py | 70 ++++----- tests/unit/test_archive_member_limit.py | 20 --- tests/unit/test_archive_root_variants.py | 238 +++++++++++++------------------ 4 files changed, 155 insertions(+), 190 deletions(-) diff --git a/atr/tasks/checks/__init__.py b/atr/tasks/checks/__init__.py index 2e552c9b..c223f83d 100644 --- a/atr/tasks/checks/__init__.py +++ b/atr/tasks/checks/__init__.py @@ -312,6 +312,23 @@ def function_key(func: Callable[..., Any] | str) -> str: return func.__module__ + "." + func.__name__ if callable(func) else func +async def resolve_cache_dir(args: FunctionArguments) -> pathlib.Path | None: + """Resolve the quarantine extraction cache directory for the primary archive.""" + if args.primary_rel_path is None: + return None + paths_data = await attestable.load_paths(args.project_name, args.version_name, args.revision_number) + if paths_data is None: + return None + content_hash = paths_data.get(args.primary_rel_path) + if content_hash is None: + return None + cache_key = hashes.filesystem_cache_archives_key(content_hash) + cache_dir = file_paths.get_cache_archives_dir() / str(args.project_name) / str(args.version_name) / cache_key + if await aiofiles.os.path.isdir(cache_dir): + return cache_dir + return None + + async def resolve_cache_key( checker: str | Callable[..., Any], checker_version: str, diff --git a/atr/tasks/checks/targz.py b/atr/tasks/checks/targz.py index 19f753c8..02534032 100644 --- a/atr/tasks/checks/targz.py +++ b/atr/tasks/checks/targz.py @@ -16,6 +16,10 @@ # under the License. import asyncio +import contextlib +import os +import pathlib +import stat from typing import Final import atr.archives as archives @@ -28,7 +32,7 @@ import atr.util as util # Release policy fields which this check relies on - used for result caching INPUT_POLICY_KEYS: Final[list[str]] = [] INPUT_EXTRA_ARGS: Final[list[str]] = [] -CHECK_VERSION: Final[str] = "1" +CHECK_VERSION: Final[str] = "2" class RootDirectoryError(Exception): @@ -56,50 +60,48 @@ async def integrity(args: checks.FunctionArguments) -> results.Results | None: return None -def root_directory(tgz_path: str) -> tuple[str, bytes | None]: # noqa: C901 - """Find root directory and extract package/package.json if found.""" - root = None - package_json: bytes | None = None +def root_directory(cache_dir: pathlib.Path) -> tuple[str, bytes | None]: + """Find root directory and read package/package.json from the extracted tree.""" + # The ._ prefix is a metadata convention + entries = sorted(e for e in os.listdir(cache_dir) if not e.startswith("._")) - with tarzip.open_archive(tgz_path) as archive: - for member in archive: - if member.name and member.name.split("/")[-1].startswith("._"): - # Metadata convention - continue - - parts = member.name.split("/", 1) - if len(parts) >= 1: - if root is None: - root = parts[0] - elif parts[0] != root: - raise RootDirectoryError(f"Multiple root directories found: {root}, {parts[0]}") - - if (root == "package") and (package_json is None): - member_name = member.name.lstrip("./") - if (member_name == "package/package.json") and member.isfile(): - size = member.size if hasattr(member, "size") else 0 - if (size > 0) and (size <= util.NPM_PACKAGE_JSON_MAX_SIZE): - f = archive.extractfile(member) - if f is not None: - try: - package_json = f.read() - finally: - f.close() - - if not root: + if not entries: raise RootDirectoryError("No root directory found in archive") + if len(entries) > 1: + raise RootDirectoryError(f"Multiple root directories found: {entries[0]}, {entries[1]}") + + root = entries[0] + package_json: bytes | None = None + + if root == "package": + package_json_path = cache_dir / "package" / "package.json" + with contextlib.suppress(FileNotFoundError, OSError): + package_json_stat = package_json_path.lstat() + # We do this to avoid allowing package.json to be a symlink + if stat.S_ISREG(package_json_stat.st_mode): + size = package_json_stat.st_size + if (size > 0) and (size <= util.NPM_PACKAGE_JSON_MAX_SIZE): + package_json = package_json_path.read_bytes() return root, package_json async def structure(args: checks.FunctionArguments) -> results.Results | None: # noqa: C901 - """Check the structure of a .tar.gz file.""" + """Check the structure of a .tar.gz file using the extracted tree.""" recorder = await args.recorder() if not (artifact_abs_path := await recorder.abs_path()): return None if await recorder.primary_path_is_binary(): return None + cache_dir = await checks.resolve_cache_dir(args) + if cache_dir is None: + await recorder.failure( + "Extracted archive tree is not available", + {"rel_path": args.primary_rel_path}, + ) + return None + filename = artifact_abs_path.name basename_from_filename: Final[str] = ( filename.removesuffix(".tar.gz") if filename.endswith(".tar.gz") else filename.removesuffix(".tgz") @@ -112,7 +114,7 @@ async def structure(args: checks.FunctionArguments) -> results.Results | None: ) try: - root, package_json = await asyncio.to_thread(root_directory, str(artifact_abs_path)) + root, package_json = await asyncio.to_thread(root_directory, cache_dir) data: dict[str, object] = { "root": root, "basename_from_filename": basename_from_filename, @@ -149,8 +151,6 @@ async def structure(args: checks.FunctionArguments) -> results.Results | None: await recorder.failure( f"Root directory '{root}' does not match expected names '{expected_roots_display}'", data ) - except tarzip.ArchiveMemberLimitExceededError as e: - await recorder.failure(f"Archive has too many members: {e}", {"error": str(e)}) except RootDirectoryError as e: await recorder.failure("Could not get the root directory of the archive", {"error": str(e)}) except Exception as e: diff --git a/tests/unit/test_archive_member_limit.py b/tests/unit/test_archive_member_limit.py index c3225fb3..b2ae672b 100644 --- a/tests/unit/test_archive_member_limit.py +++ b/tests/unit/test_archive_member_limit.py @@ -142,26 +142,6 @@ async def test_targz_integrity_reports_member_limit(tmp_path, monkeypatch): assert any("too many members" in message.lower() for _, message, _ in recorder.messages) [email protected] -async def test_targz_structure_reports_member_limit(tmp_path, monkeypatch): - archive_path = tmp_path / "sample.tar" - # Must include the root directory here - _make_tar(archive_path, ["sample/a.txt", "sample/b.txt", "sample/c.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_member_limit") - - original_open = tarzip.open_archive - - def limited_open(path: str, *args, **kwargs): - return original_open(path, max_members=2) - - monkeypatch.setattr(tarzip, "open_archive", limited_open) - - args = await _args_for(recorder) - await targz.structure(args) - - assert any("too many members" in message.lower() for _, message, _ in recorder.messages) - - def test_zipformat_integrity_reports_member_limit(tmp_path, monkeypatch): archive_path = tmp_path / "sample.zip" _make_zip(archive_path, ["a.txt", "b.txt", "c.txt"]) diff --git a/tests/unit/test_archive_root_variants.py b/tests/unit/test_archive_root_variants.py index c71ce3d7..7b7b56f2 100644 --- a/tests/unit/test_archive_root_variants.py +++ b/tests/unit/test_archive_root_variants.py @@ -15,10 +15,9 @@ # specific language governing permissions and limitations # under the License. -import io import json import pathlib -import tarfile +import unittest.mock as mock import zipfile import pytest @@ -32,26 +31,17 @@ import tests.unit.recorders as recorders @pytest.mark.asyncio async def test_targz_structure_accepts_npm_pack_root(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "example-1.2.3.tgz" - _make_tar_gz_with_contents( - archive_path, + cache_dir = _make_cache_tree_with_contents( + tmp_path, { "package/package.json": json.dumps({"name": "example", "version": "1.2.3"}), "package/README.txt": "hello", }, ) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ in recorder.messages) assert not any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @@ -59,66 +49,50 @@ async def test_targz_structure_accepts_npm_pack_root(tmp_path: pathlib.Path) -> @pytest.mark.asyncio async def test_targz_structure_accepts_source_suffix_variant(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3-source.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3-source.tar.gz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ in recorder.messages) @pytest.mark.asyncio async def test_targz_structure_accepts_src_suffix_variant(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3-src.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3-src.tar.gz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ in recorder.messages) [email protected] +async def test_targz_structure_fails_when_cache_unavailable(tmp_path: pathlib.Path) -> None: + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3.tar.gz") + + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=None)): + await targz.structure(args) + + assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) + assert any("extracted archive tree is not available" in message.lower() for _, message, _ in recorder.messages) + + @pytest.mark.asyncio async def test_targz_structure_rejects_npm_pack_filename_mismatch(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "example-1.2.3.tgz" - _make_tar_gz_with_contents( - archive_path, + cache_dir = _make_cache_tree_with_contents( + tmp_path, { "package/package.json": json.dumps({"name": "different", "version": "1.2.3"}), "package/README.txt": "hello", }, ) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) assert any("npm pack layout detected" in message for _, message, _ in recorder.messages) @@ -126,105 +100,78 @@ async def test_targz_structure_rejects_npm_pack_filename_mismatch(tmp_path: path @pytest.mark.asyncio async def test_targz_structure_rejects_package_root_without_package_json(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "example-1.2.3.tgz" - _make_tar_gz_with_contents( - archive_path, + cache_dir = _make_cache_tree_with_contents( + tmp_path, { "package/README.txt": "hello", }, ) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @pytest.mark.asyncio async def test_targz_structure_rejects_source_root_when_filename_has_no_suffix(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3-source/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3-source/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3.tar.gz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @pytest.mark.asyncio async def test_targz_structure_rejects_source_root_when_filename_has_src_suffix(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3-src.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3-source/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3-source/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3-src.tar.gz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @pytest.mark.asyncio async def test_targz_structure_rejects_src_root_when_filename_has_no_suffix(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3-src/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, - ) + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3-src/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3.tar.gz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @pytest.mark.asyncio async def test_targz_structure_rejects_src_root_when_filename_has_source_suffix(tmp_path: pathlib.Path) -> None: - archive_path = tmp_path / "apache-example-1.2.3-source.tar.gz" - _make_tar_gz(archive_path, ["apache-example-1.2.3-src/README.txt"]) - recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") - args = checks.FunctionArguments( - recorder=recorders.get_recorder(recorder), - asf_uid="", - project_name="test", - version_name="test", - revision_number="00001", - primary_rel_path=None, - extra_args={}, + cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3-src/README.txt"]) + recorder, args = await _targz_structure_args(tmp_path, "apache-example-1.2.3-source.tar.gz") + + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) + + assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) + + [email protected] +async def test_targz_structure_rejects_symlinked_package_json(tmp_path: pathlib.Path) -> None: + cache_dir = _make_cache_tree_with_contents( + tmp_path, + { + "package/README.txt": "hello", + "metadata.json": json.dumps({"name": "example", "version": "1.2.3"}), + }, ) + (cache_dir / "package" / "package.json").symlink_to(cache_dir / "metadata.json") + recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz") - await targz.structure(args) + with mock.patch.object(checks, "resolve_cache_dir", new=mock.AsyncMock(return_value=cache_dir)): + await targz.structure(args) assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ in recorder.messages) @@ -299,22 +246,26 @@ def test_zipformat_structure_rejects_package_root_without_package_json(tmp_path: assert "Root directory mismatch" in result["error"] -def _make_tar_gz(path: pathlib.Path, members: list[str]) -> None: - with tarfile.open(path, "w:gz") as tf: - for name in members: - data = f"data-{name}".encode() - info = tarfile.TarInfo(name=name) - info.size = len(data) - tf.addfile(info, io.BytesIO(data)) +def _make_cache_tree(base: pathlib.Path, members: list[str]) -> pathlib.Path: + """Create a directory tree simulating the quarantine extraction cache.""" + cache_dir = base / "cache" + cache_dir.mkdir() + for name in members: + path = cache_dir / name + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(f"data-{name}") + return cache_dir -def _make_tar_gz_with_contents(path: pathlib.Path, members: dict[str, str]) -> None: - with tarfile.open(path, "w:gz") as tf: - for name, content in members.items(): - data = content.encode() - info = tarfile.TarInfo(name=name) - info.size = len(data) - tf.addfile(info, io.BytesIO(data)) +def _make_cache_tree_with_contents(base: pathlib.Path, members: dict[str, str]) -> pathlib.Path: + """Create a directory tree with specific file contents.""" + cache_dir = base / "cache" + cache_dir.mkdir() + for name, content in members.items(): + path = cache_dir / name + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + return cache_dir def _make_zip(path: pathlib.Path, members: list[str]) -> None: @@ -327,3 +278,20 @@ def _make_zip_with_contents(path: pathlib.Path, members: dict[str, str]) -> None with zipfile.ZipFile(path, "w") as zf: for name, content in members.items(): zf.writestr(name, content) + + +async def _targz_structure_args( + tmp_path: pathlib.Path, archive_filename: str +) -> tuple[recorders.RecorderStub, checks.FunctionArguments]: + archive_path = tmp_path / archive_filename + recorder = recorders.RecorderStub(archive_path, "tests.unit.test_archive_root_variants") + args = checks.FunctionArguments( + recorder=recorders.get_recorder(recorder), + asf_uid="", + project_name="test", + version_name="test", + revision_number="00001", + primary_rel_path=archive_filename, + extra_args={}, + ) + return recorder, args --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
