This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git

commit be6c4bfc82ce17db0aae1a327a0a0e8ba0f06513
Author: Sean B. Palmer <[email protected]>
AuthorDate: Wed Mar 11 14:57:36 2026 +0000

    Use extracted archives for source tree comparison checks
---
 atr/tasks/checks/compare.py       |  61 +++--------------
 tests/unit/test_checks_compare.py | 140 ++++++++++----------------------------
 2 files changed, 46 insertions(+), 155 deletions(-)

diff --git a/atr/tasks/checks/compare.py b/atr/tasks/checks/compare.py
index 54906530..e131d190 100644
--- a/atr/tasks/checks/compare.py
+++ b/atr/tasks/checks/compare.py
@@ -36,7 +36,6 @@ import dulwich.porcelain
 import dulwich.refs
 import pydantic
 
-import atr.archives as archives
 import atr.attestable as attestable
 import atr.config as config
 import atr.log as log
@@ -56,7 +55,7 @@ _PERMITTED_ADDED_PATHS: Final[dict[str, list[str]]] = {
 # Release policy fields which this check relies on - used for result caching
 INPUT_POLICY_KEYS: Final[list[str]] = []
 INPUT_EXTRA_ARGS: Final[list[str]] = ["github_tp_sha"]
-CHECK_VERSION: Final[str] = "1"
+CHECK_VERSION: Final[str] = "2"
 
 
 @dataclasses.dataclass
@@ -102,15 +101,18 @@ async def source_trees(args: checks.FunctionArguments) -> 
results.Results | None
     if payload is not None:
         if not (primary_abs_path := await recorder.abs_path()):
             return None
-        max_extract_size = args.extra_args.get("max_extract_size", 
_CONFIG.MAX_EXTRACT_SIZE)
-        chunk_size = args.extra_args.get("chunk_size", 
_CONFIG.EXTRACT_CHUNK_SIZE)
+        cache_dir = await checks.resolve_cache_dir(args)
+        if cache_dir is None:
+            await recorder.failure(
+                "Extracted archive tree is not available",
+                {"rel_path": args.primary_rel_path},
+            )
+            return None
         tmp_dir = paths.get_tmp_dir()
         await aiofiles.os.makedirs(tmp_dir, exist_ok=True)
         async with util.async_temporary_directory(prefix="trees-", 
dir=tmp_dir) as temp_dir:
             github_dir = temp_dir / "github"
-            archive_dir_path = temp_dir / "archive"
             await aiofiles.os.makedirs(github_dir, exist_ok=True)
-            await aiofiles.os.makedirs(archive_dir_path, exist_ok=True)
             checkout_dir = await _checkout_github_source(payload, github_dir)
             if checkout_dir is None:
                 await recorder.failure(
@@ -118,17 +120,11 @@ async def source_trees(args: checks.FunctionArguments) -> 
results.Results | None
                     {"repo_url": 
f"https://github.com/{payload.repository}.git", "sha": payload.sha},
                 )
                 return None
-            if not await _decompress_archive(primary_abs_path, 
archive_dir_path, max_extract_size, chunk_size):
-                await recorder.failure(
-                    "Failed to extract source archive for comparison",
-                    {"archive_path": str(primary_abs_path), "extract_dir": 
str(archive_dir_path)},
-                )
-                return None
-            archive_root_result = await _find_archive_root(primary_abs_path, 
archive_dir_path)
+            archive_root_result = await _find_archive_root(primary_abs_path, 
cache_dir)
             if archive_root_result.root is None:
                 await recorder.failure(
                     "Could not determine archive root directory for 
comparison",
-                    {"archive_path": str(primary_abs_path), "extract_dir": 
str(archive_dir_path)},
+                    {"archive_path": str(primary_abs_path), "extract_dir": 
str(cache_dir)},
                 )
                 return None
             if archive_root_result.extra_entries:
@@ -141,7 +137,7 @@ async def source_trees(args: checks.FunctionArguments) -> 
results.Results | None
                     },
                 )
                 return None
-            archive_content_dir = archive_dir_path / archive_root_result.root
+            archive_content_dir = cache_dir / archive_root_result.root
             archive_dir = str(archive_content_dir)
             try:
                 comparison = await _compare_trees(github_dir, 
archive_content_dir)
@@ -304,41 +300,6 @@ def _compare_trees_rsync(repo_dir: pathlib.Path, 
archive_dir: pathlib.Path) -> T
     return TreeComparisonResult(invalid, repo_only)
 
 
-async def _decompress_archive(
-    archive_path: pathlib.Path,
-    extract_dir: pathlib.Path,
-    max_extract_size: int,
-    chunk_size: int,
-) -> bool:
-    started_ns = time.perf_counter_ns()
-    try:
-        extracted_size, _extracted_paths = await asyncio.to_thread(
-            archives.extract,
-            str(archive_path),
-            str(extract_dir),
-            max_size=max_extract_size,
-            chunk_size=chunk_size,
-        )
-    except (archives.ExtractionError, OSError):
-        elapsed_ms = (time.perf_counter_ns() - started_ns) / 1_000_000.0
-        log.exception(
-            "Failed to extract source archive for compare.source_trees",
-            archive_path=str(archive_path),
-            extract_dir=str(extract_dir),
-            extract_ms=elapsed_ms,
-        )
-        return False
-    elapsed_ms = (time.perf_counter_ns() - started_ns) / 1_000_000.0
-    log.debug(
-        "Extracted source archive for compare.source_trees",
-        archive_path=str(archive_path),
-        extract_dir=str(extract_dir),
-        extracted_bytes=extracted_size,
-        extract_ms=elapsed_ms,
-    )
-    return True
-
-
 def _ensure_clone_identity_env() -> None:
     os.environ["USER"] = _DEFAULT_USER
     os.environ["EMAIL"] = _DEFAULT_EMAIL
diff --git a/tests/unit/test_checks_compare.py 
b/tests/unit/test_checks_compare.py
index 7cb58981..b608241d 100644
--- a/tests/unit/test_checks_compare.py
+++ b/tests/unit/test_checks_compare.py
@@ -79,42 +79,12 @@ class CompareRecorder:
         return 
atr.tasks.checks.compare.TreeComparisonResult(set(self.invalid), 
set(self.repo_only))
 
 
-class DecompressRecorder:
-    def __init__(self, return_value: bool = True) -> None:
-        self.archive_path: pathlib.Path | None = None
-        self.extract_dir: pathlib.Path | None = None
-        self.max_extract_size: int | None = None
-        self.chunk_size: int | None = None
-        self.return_value = return_value
-
-    async def __call__(
-        self,
-        archive_path: pathlib.Path,
-        extract_dir: pathlib.Path,
-        max_extract_size: int,
-        chunk_size: int,
-    ) -> bool:
-        self.archive_path = archive_path
-        self.extract_dir = extract_dir
-        self.max_extract_size = max_extract_size
-        self.chunk_size = chunk_size
-        assert await aiofiles.os.path.exists(extract_dir)
-        return self.return_value
-
+class CacheDirResolver:
+    def __init__(self, cache_dir: pathlib.Path | None) -> None:
+        self.cache_dir = cache_dir
 
-class ExtractErrorRaiser:
-    def __call__(self, *args: object, **kwargs: object) -> tuple[int, 
list[str]]:
-        raise atr.tasks.checks.compare.archives.ExtractionError("Extraction 
error")
-
-
-class ExtractRecorder:
-    def __init__(self, extracted_size: int = 123) -> None:
-        self.calls: list[tuple[str, str, int, int]] = []
-        self.extracted_size = extracted_size
-
-    def __call__(self, archive_path: str, extract_dir: str, max_size: int, 
chunk_size: int) -> tuple[int, list[str]]:
-        self.calls.append((archive_path, extract_dir, max_size, chunk_size))
-        return self.extracted_size, []
+    async def __call__(self, args: object) -> pathlib.Path | None:
+        return self.cache_dir
 
 
 class FindArchiveRootRecorder:
@@ -528,36 +498,6 @@ def test_compare_trees_rsync_trees_match(monkeypatch: 
pytest.MonkeyPatch, tmp_pa
     assert result.repo_only == set()
 
 
[email protected]
-async def test_decompress_archive_calls_extract(monkeypatch: 
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
-    archive_path = tmp_path / "artifact.tar.gz"
-    extract_dir = tmp_path / "extracted"
-    extract_dir.mkdir()
-    extract_recorder = ExtractRecorder()
-
-    monkeypatch.setattr(atr.tasks.checks.compare.archives, "extract", 
extract_recorder)
-
-    result = await atr.tasks.checks.compare._decompress_archive(archive_path, 
extract_dir, 10, 20)
-
-    assert result is True
-    assert extract_recorder.calls == [(str(archive_path), str(extract_dir), 
10, 20)]
-
-
[email protected]
-async def test_decompress_archive_handles_extraction_error(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
-) -> None:
-    archive_path = tmp_path / "artifact.tar.gz"
-    extract_dir = tmp_path / "extracted"
-    extract_dir.mkdir()
-
-    monkeypatch.setattr(atr.tasks.checks.compare.archives, "extract", 
ExtractErrorRaiser())
-
-    result = await atr.tasks.checks.compare._decompress_archive(archive_path, 
extract_dir, 10, 20)
-
-    assert result is False
-
-
 @pytest.mark.asyncio
 async def test_find_archive_root_accepts_any_single_directory(tmp_path: 
pathlib.Path) -> None:
     archive_path = tmp_path / "my-project-1.0.0.tar.gz"
@@ -674,14 +614,15 @@ async def 
test_source_trees_creates_temp_workspace_and_cleans_up(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
-    decompress = DecompressRecorder()
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
     find_root = FindArchiveRootRecorder("artifact")
     compare = CompareRecorder(repo_only={"extra1.txt", "extra2.txt"})
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
@@ -691,8 +632,6 @@ async def 
test_source_trees_creates_temp_workspace_and_cleans_up(
     assert checkout.checkout_dir is not None
     checkout_dir = checkout.checkout_dir
     assert checkout_dir.name == "github"
-    assert decompress.extract_dir is not None
-    assert decompress.extract_dir.name == "archive"
     assert checkout_dir.parent.parent == tmp_root
     assert checkout_dir.parent.name.startswith("trees-")
     assert await aiofiles.os.path.exists(tmp_root)
@@ -716,11 +655,7 @@ async def 
test_source_trees_payload_none_skips_temp_workspace(monkeypatch: pytes
         "_checkout_github_source",
         RaiseAsync("_checkout_github_source should not be called"),
     )
-    monkeypatch.setattr(
-        atr.tasks.checks.compare,
-        "_decompress_archive",
-        RaiseAsync("_decompress_archive should not be called"),
-    )
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
RaiseAsync("resolve_cache_dir should not be called"))
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
RaiseSync("get_tmp_dir should not be called"))
 
     await atr.tasks.checks.compare.source_trees(args)
@@ -734,24 +669,16 @@ async def 
test_source_trees_permits_pkg_info_when_pyproject_toml_exists(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
+    cache_dir = tmp_path / "cache"
+    (cache_dir / "artifact").mkdir(parents=True)
+    (cache_dir / "artifact" / "pyproject.toml").write_text("[project]\nname = 
'test'\n")
     find_root = FindArchiveRootRecorder("artifact")
     compare = CompareRecorder(invalid={"PKG-INFO"})
     tmp_root = tmp_path / "temporary-root"
 
-    async def decompress_with_pyproject(
-        archive_path: pathlib.Path,
-        extract_dir: pathlib.Path,
-        max_extract_size: int,
-        chunk_size: int,
-    ) -> bool:
-        archive_content = extract_dir / "artifact"
-        archive_content.mkdir(parents=True, exist_ok=True)
-        (archive_content / "pyproject.toml").write_text("[project]\nname = 
'test'\n")
-        return True
-
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress_with_pyproject)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
@@ -770,14 +697,15 @@ async def 
test_source_trees_records_failure_when_archive_has_invalid_files(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
-    decompress = DecompressRecorder()
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
     find_root = FindArchiveRootRecorder("artifact")
     compare = CompareRecorder(invalid={"bad1.txt", "bad2.txt"}, 
repo_only={"ok.txt"})
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
@@ -801,13 +729,14 @@ async def 
test_source_trees_records_failure_when_archive_root_not_found(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
-    decompress = DecompressRecorder()
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
     find_root = FindArchiveRootRecorder(root=None)
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
 
@@ -820,29 +749,28 @@ async def 
test_source_trees_records_failure_when_archive_root_not_found(
 
 
 @pytest.mark.asyncio
-async def test_source_trees_records_failure_when_decompress_fails(
-    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+async def test_source_trees_records_failure_when_cache_dir_unavailable(
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     recorder = RecorderStub(True)
     args = _make_args(recorder)
     payload = _make_payload()
-    checkout = CheckoutRecorder()
-    decompress = DecompressRecorder(return_value=False)
-    tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
-    monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
-    monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(None))
+    monkeypatch.setattr(
+        atr.tasks.checks.compare,
+        "_checkout_github_source",
+        RaiseAsync("_checkout_github_source should not be called"),
+    )
+    monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
RaiseSync("get_tmp_dir should not be called"))
 
     await atr.tasks.checks.compare.source_trees(args)
 
     assert len(recorder.failure_calls) == 1
     message, data = recorder.failure_calls[0]
-    assert message == "Failed to extract source archive for comparison"
+    assert message == "Extracted archive tree is not available"
     assert isinstance(data, dict)
-    assert data["archive_path"] == str(await recorder.abs_path())
-    assert data["extract_dir"] == str(decompress.extract_dir)
 
 
 @pytest.mark.asyncio
@@ -853,13 +781,14 @@ async def 
test_source_trees_records_failure_when_extra_entries_in_archive(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
-    decompress = DecompressRecorder()
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
     find_root = FindArchiveRootRecorder(root="artifact", 
extra_entries=["README.txt", "extra.txt"])
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))
 
@@ -881,7 +810,8 @@ async def 
test_source_trees_reports_repo_only_sample_limited_to_five(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
-    decompress = DecompressRecorder()
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
     find_root = FindArchiveRootRecorder("artifact")
     repo_only_files = {f"file{i}.txt" for i in range(10)}
     compare = CompareRecorder(repo_only=repo_only_files)
@@ -889,7 +819,7 @@ async def 
test_source_trees_reports_repo_only_sample_limited_to_five(
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
-    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks, "resolve_cache_dir", 
CacheDirResolver(cache_dir))
     monkeypatch.setattr(atr.tasks.checks.compare, "_find_archive_root", 
find_root)
     monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
     monkeypatch.setattr(atr.tasks.checks.compare.paths, "get_tmp_dir", 
ReturnValue(tmp_root))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to