This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git

commit b280bcf1ee3df6de3c78a949eae2379fa7e5c34a
Author: Sean B. Palmer <[email protected]>
AuthorDate: Tue Mar 10 18:58:56 2026 +0000

    Check .tar.gz structure using the extracted files only
---
 atr/tasks/checks/__init__.py             |  17 +++
 atr/tasks/checks/targz.py                |  70 ++++-----
 tests/unit/test_archive_member_limit.py  |  20 ---
 tests/unit/test_archive_root_variants.py | 238 +++++++++++++------------------
 4 files changed, 155 insertions(+), 190 deletions(-)

diff --git a/atr/tasks/checks/__init__.py b/atr/tasks/checks/__init__.py
index 2e552c9b..c223f83d 100644
--- a/atr/tasks/checks/__init__.py
+++ b/atr/tasks/checks/__init__.py
@@ -312,6 +312,23 @@ def function_key(func: Callable[..., Any] | str) -> str:
     return func.__module__ + "." + func.__name__ if callable(func) else func
 
 
+async def resolve_cache_dir(args: FunctionArguments) -> pathlib.Path | None:
+    """Resolve the quarantine extraction cache directory for the primary 
archive."""
+    if args.primary_rel_path is None:
+        return None
+    paths_data = await attestable.load_paths(args.project_name, 
args.version_name, args.revision_number)
+    if paths_data is None:
+        return None
+    content_hash = paths_data.get(args.primary_rel_path)
+    if content_hash is None:
+        return None
+    cache_key = hashes.filesystem_cache_archives_key(content_hash)
+    cache_dir = file_paths.get_cache_archives_dir() / str(args.project_name) / 
str(args.version_name) / cache_key
+    if await aiofiles.os.path.isdir(cache_dir):
+        return cache_dir
+    return None
+
+
 async def resolve_cache_key(
     checker: str | Callable[..., Any],
     checker_version: str,
diff --git a/atr/tasks/checks/targz.py b/atr/tasks/checks/targz.py
index 19f753c8..02534032 100644
--- a/atr/tasks/checks/targz.py
+++ b/atr/tasks/checks/targz.py
@@ -16,6 +16,10 @@
 # under the License.
 
 import asyncio
+import contextlib
+import os
+import pathlib
+import stat
 from typing import Final
 
 import atr.archives as archives
@@ -28,7 +32,7 @@ import atr.util as util
 # Release policy fields which this check relies on - used for result caching
 INPUT_POLICY_KEYS: Final[list[str]] = []
 INPUT_EXTRA_ARGS: Final[list[str]] = []
-CHECK_VERSION: Final[str] = "1"
+CHECK_VERSION: Final[str] = "2"
 
 
 class RootDirectoryError(Exception):
@@ -56,50 +60,48 @@ async def integrity(args: checks.FunctionArguments) -> 
results.Results | None:
     return None
 
 
-def root_directory(tgz_path: str) -> tuple[str, bytes | None]:  # noqa: C901
-    """Find root directory and extract package/package.json if found."""
-    root = None
-    package_json: bytes | None = None
+def root_directory(cache_dir: pathlib.Path) -> tuple[str, bytes | None]:
+    """Find root directory and read package/package.json from the extracted 
tree."""
+    # The ._ prefix is a metadata convention
+    entries = sorted(e for e in os.listdir(cache_dir) if not 
e.startswith("._"))
 
-    with tarzip.open_archive(tgz_path) as archive:
-        for member in archive:
-            if member.name and member.name.split("/")[-1].startswith("._"):
-                # Metadata convention
-                continue
-
-            parts = member.name.split("/", 1)
-            if len(parts) >= 1:
-                if root is None:
-                    root = parts[0]
-                elif parts[0] != root:
-                    raise RootDirectoryError(f"Multiple root directories 
found: {root}, {parts[0]}")
-
-            if (root == "package") and (package_json is None):
-                member_name = member.name.lstrip("./")
-                if (member_name == "package/package.json") and member.isfile():
-                    size = member.size if hasattr(member, "size") else 0
-                    if (size > 0) and (size <= util.NPM_PACKAGE_JSON_MAX_SIZE):
-                        f = archive.extractfile(member)
-                        if f is not None:
-                            try:
-                                package_json = f.read()
-                            finally:
-                                f.close()
-
-    if not root:
+    if not entries:
         raise RootDirectoryError("No root directory found in archive")
+    if len(entries) > 1:
+        raise RootDirectoryError(f"Multiple root directories found: 
{entries[0]}, {entries[1]}")
+
+    root = entries[0]
+    package_json: bytes | None = None
+
+    if root == "package":
+        package_json_path = cache_dir / "package" / "package.json"
+        with contextlib.suppress(FileNotFoundError, OSError):
+            package_json_stat = package_json_path.lstat()
+            # We do this to avoid allowing package.json to be a symlink
+            if stat.S_ISREG(package_json_stat.st_mode):
+                size = package_json_stat.st_size
+                if (size > 0) and (size <= util.NPM_PACKAGE_JSON_MAX_SIZE):
+                    package_json = package_json_path.read_bytes()
 
     return root, package_json
 
 
 async def structure(args: checks.FunctionArguments) -> results.Results | None: 
 # noqa: C901
-    """Check the structure of a .tar.gz file."""
+    """Check the structure of a .tar.gz file using the extracted tree."""
     recorder = await args.recorder()
     if not (artifact_abs_path := await recorder.abs_path()):
         return None
     if await recorder.primary_path_is_binary():
         return None
 
+    cache_dir = await checks.resolve_cache_dir(args)
+    if cache_dir is None:
+        await recorder.failure(
+            "Extracted archive tree is not available",
+            {"rel_path": args.primary_rel_path},
+        )
+        return None
+
     filename = artifact_abs_path.name
     basename_from_filename: Final[str] = (
         filename.removesuffix(".tar.gz") if filename.endswith(".tar.gz") else 
filename.removesuffix(".tgz")
@@ -112,7 +114,7 @@ async def structure(args: checks.FunctionArguments) -> 
results.Results | None:
     )
 
     try:
-        root, package_json = await asyncio.to_thread(root_directory, 
str(artifact_abs_path))
+        root, package_json = await asyncio.to_thread(root_directory, cache_dir)
         data: dict[str, object] = {
             "root": root,
             "basename_from_filename": basename_from_filename,
@@ -149,8 +151,6 @@ async def structure(args: checks.FunctionArguments) -> 
results.Results | None:
             await recorder.failure(
                 f"Root directory '{root}' does not match expected names 
'{expected_roots_display}'", data
             )
-    except tarzip.ArchiveMemberLimitExceededError as e:
-        await recorder.failure(f"Archive has too many members: {e}", {"error": 
str(e)})
     except RootDirectoryError as e:
         await recorder.failure("Could not get the root directory of the 
archive", {"error": str(e)})
     except Exception as e:
diff --git a/tests/unit/test_archive_member_limit.py 
b/tests/unit/test_archive_member_limit.py
index c3225fb3..b2ae672b 100644
--- a/tests/unit/test_archive_member_limit.py
+++ b/tests/unit/test_archive_member_limit.py
@@ -142,26 +142,6 @@ async def 
test_targz_integrity_reports_member_limit(tmp_path, monkeypatch):
     assert any("too many members" in message.lower() for _, message, _ in 
recorder.messages)
 
 
[email protected]
-async def test_targz_structure_reports_member_limit(tmp_path, monkeypatch):
-    archive_path = tmp_path / "sample.tar"
-    # Must include the root directory here
-    _make_tar(archive_path, ["sample/a.txt", "sample/b.txt", "sample/c.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_member_limit")
-
-    original_open = tarzip.open_archive
-
-    def limited_open(path: str, *args, **kwargs):
-        return original_open(path, max_members=2)
-
-    monkeypatch.setattr(tarzip, "open_archive", limited_open)
-
-    args = await _args_for(recorder)
-    await targz.structure(args)
-
-    assert any("too many members" in message.lower() for _, message, _ in 
recorder.messages)
-
-
 def test_zipformat_integrity_reports_member_limit(tmp_path, monkeypatch):
     archive_path = tmp_path / "sample.zip"
     _make_zip(archive_path, ["a.txt", "b.txt", "c.txt"])
diff --git a/tests/unit/test_archive_root_variants.py 
b/tests/unit/test_archive_root_variants.py
index c71ce3d7..7b7b56f2 100644
--- a/tests/unit/test_archive_root_variants.py
+++ b/tests/unit/test_archive_root_variants.py
@@ -15,10 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import io
 import json
 import pathlib
-import tarfile
+import unittest.mock as mock
 import zipfile
 
 import pytest
@@ -32,26 +31,17 @@ import tests.unit.recorders as recorders
 
 @pytest.mark.asyncio
 async def test_targz_structure_accepts_npm_pack_root(tmp_path: pathlib.Path) 
-> None:
-    archive_path = tmp_path / "example-1.2.3.tgz"
-    _make_tar_gz_with_contents(
-        archive_path,
+    cache_dir = _make_cache_tree_with_contents(
+        tmp_path,
         {
             "package/package.json": json.dumps({"name": "example", "version": 
"1.2.3"}),
             "package/README.txt": "hello",
         },
     )
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ 
in recorder.messages)
     assert not any(status == sql.CheckResultStatus.FAILURE.value for status, 
_, _ in recorder.messages)
@@ -59,66 +49,50 @@ async def 
test_targz_structure_accepts_npm_pack_root(tmp_path: pathlib.Path) ->
 
 @pytest.mark.asyncio
 async def test_targz_structure_accepts_source_suffix_variant(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3-source.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3-source.tar.gz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ 
in recorder.messages)
 
 
 @pytest.mark.asyncio
 async def test_targz_structure_accepts_src_suffix_variant(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3-src.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    cache_dir = _make_cache_tree(tmp_path, ["apache-example-1.2.3/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3-src.tar.gz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.SUCCESS.value for status, _, _ 
in recorder.messages)
 
 
[email protected]
+async def test_targz_structure_fails_when_cache_unavailable(tmp_path: 
pathlib.Path) -> None:
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3.tar.gz")
+
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=None)):
+        await targz.structure(args)
+
+    assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
+    assert any("extracted archive tree is not available" in message.lower() 
for _, message, _ in recorder.messages)
+
+
 @pytest.mark.asyncio
 async def test_targz_structure_rejects_npm_pack_filename_mismatch(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "example-1.2.3.tgz"
-    _make_tar_gz_with_contents(
-        archive_path,
+    cache_dir = _make_cache_tree_with_contents(
+        tmp_path,
         {
             "package/package.json": json.dumps({"name": "different", 
"version": "1.2.3"}),
             "package/README.txt": "hello",
         },
     )
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
     assert any("npm pack layout detected" in message for _, message, _ in 
recorder.messages)
@@ -126,105 +100,78 @@ async def 
test_targz_structure_rejects_npm_pack_filename_mismatch(tmp_path: path
 
 @pytest.mark.asyncio
 async def 
test_targz_structure_rejects_package_root_without_package_json(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "example-1.2.3.tgz"
-    _make_tar_gz_with_contents(
-        archive_path,
+    cache_dir = _make_cache_tree_with_contents(
+        tmp_path,
         {
             "package/README.txt": "hello",
         },
     )
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
 
 
 @pytest.mark.asyncio
 async def 
test_targz_structure_rejects_source_root_when_filename_has_no_suffix(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3-source/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    cache_dir = _make_cache_tree(tmp_path, 
["apache-example-1.2.3-source/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3.tar.gz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
 
 
 @pytest.mark.asyncio
 async def 
test_targz_structure_rejects_source_root_when_filename_has_src_suffix(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3-src.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3-source/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    cache_dir = _make_cache_tree(tmp_path, 
["apache-example-1.2.3-source/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3-src.tar.gz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
 
 
 @pytest.mark.asyncio
 async def 
test_targz_structure_rejects_src_root_when_filename_has_no_suffix(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3-src/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
-    )
+    cache_dir = _make_cache_tree(tmp_path, 
["apache-example-1.2.3-src/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3.tar.gz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
 
 
 @pytest.mark.asyncio
 async def 
test_targz_structure_rejects_src_root_when_filename_has_source_suffix(tmp_path: 
pathlib.Path) -> None:
-    archive_path = tmp_path / "apache-example-1.2.3-source.tar.gz"
-    _make_tar_gz(archive_path, ["apache-example-1.2.3-src/README.txt"])
-    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
-    args = checks.FunctionArguments(
-        recorder=recorders.get_recorder(recorder),
-        asf_uid="",
-        project_name="test",
-        version_name="test",
-        revision_number="00001",
-        primary_rel_path=None,
-        extra_args={},
+    cache_dir = _make_cache_tree(tmp_path, 
["apache-example-1.2.3-src/README.txt"])
+    recorder, args = await _targz_structure_args(tmp_path, 
"apache-example-1.2.3-source.tar.gz")
+
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
+
+    assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
+
+
[email protected]
+async def test_targz_structure_rejects_symlinked_package_json(tmp_path: 
pathlib.Path) -> None:
+    cache_dir = _make_cache_tree_with_contents(
+        tmp_path,
+        {
+            "package/README.txt": "hello",
+            "metadata.json": json.dumps({"name": "example", "version": 
"1.2.3"}),
+        },
     )
+    (cache_dir / "package" / "package.json").symlink_to(cache_dir / 
"metadata.json")
+    recorder, args = await _targz_structure_args(tmp_path, "example-1.2.3.tgz")
 
-    await targz.structure(args)
+    with mock.patch.object(checks, "resolve_cache_dir", 
new=mock.AsyncMock(return_value=cache_dir)):
+        await targz.structure(args)
 
     assert any(status == sql.CheckResultStatus.FAILURE.value for status, _, _ 
in recorder.messages)
 
@@ -299,22 +246,26 @@ def 
test_zipformat_structure_rejects_package_root_without_package_json(tmp_path:
     assert "Root directory mismatch" in result["error"]
 
 
-def _make_tar_gz(path: pathlib.Path, members: list[str]) -> None:
-    with tarfile.open(path, "w:gz") as tf:
-        for name in members:
-            data = f"data-{name}".encode()
-            info = tarfile.TarInfo(name=name)
-            info.size = len(data)
-            tf.addfile(info, io.BytesIO(data))
+def _make_cache_tree(base: pathlib.Path, members: list[str]) -> pathlib.Path:
+    """Create a directory tree simulating the quarantine extraction cache."""
+    cache_dir = base / "cache"
+    cache_dir.mkdir()
+    for name in members:
+        path = cache_dir / name
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(f"data-{name}")
+    return cache_dir
 
 
-def _make_tar_gz_with_contents(path: pathlib.Path, members: dict[str, str]) -> 
None:
-    with tarfile.open(path, "w:gz") as tf:
-        for name, content in members.items():
-            data = content.encode()
-            info = tarfile.TarInfo(name=name)
-            info.size = len(data)
-            tf.addfile(info, io.BytesIO(data))
+def _make_cache_tree_with_contents(base: pathlib.Path, members: dict[str, 
str]) -> pathlib.Path:
+    """Create a directory tree with specific file contents."""
+    cache_dir = base / "cache"
+    cache_dir.mkdir()
+    for name, content in members.items():
+        path = cache_dir / name
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(content)
+    return cache_dir
 
 
 def _make_zip(path: pathlib.Path, members: list[str]) -> None:
@@ -327,3 +278,20 @@ def _make_zip_with_contents(path: pathlib.Path, members: 
dict[str, str]) -> None
     with zipfile.ZipFile(path, "w") as zf:
         for name, content in members.items():
             zf.writestr(name, content)
+
+
+async def _targz_structure_args(
+    tmp_path: pathlib.Path, archive_filename: str
+) -> tuple[recorders.RecorderStub, checks.FunctionArguments]:
+    archive_path = tmp_path / archive_filename
+    recorder = recorders.RecorderStub(archive_path, 
"tests.unit.test_archive_root_variants")
+    args = checks.FunctionArguments(
+        recorder=recorders.get_recorder(recorder),
+        asf_uid="",
+        project_name="test",
+        version_name="test",
+        revision_number="00001",
+        primary_rel_path=archive_filename,
+        extra_args={},
+    )
+    return recorder, args


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to