This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git


The following commit(s) were added to refs/heads/sbp by this push:
     new 91157109 Use extracted archives in license header checks
91157109 is described below

commit 91157109b2dc31ec34e8c74e2262b7069854e082
Author: Sean B. Palmer <[email protected]>
AuthorDate: Wed Mar 11 14:46:26 2026 +0000

    Use extracted archives in license header checks
---
 atr/tasks/checks/license.py             | 142 +++++++++++++++++---------------
 tests/unit/test_archive_member_limit.py |  19 -----
 tests/unit/test_checks_license.py       |  34 ++++++--
 3 files changed, 100 insertions(+), 95 deletions(-)

diff --git a/atr/tasks/checks/license.py b/atr/tasks/checks/license.py
index 8720e7de..6d60e43f 100644
--- a/atr/tasks/checks/license.py
+++ b/atr/tasks/checks/license.py
@@ -29,7 +29,6 @@ import atr.log as log
 import atr.models.results as results
 import atr.models.schema as schema
 import atr.models.sql as sql
-import atr.tarzip as tarzip
 import atr.tasks.checks as checks
 import atr.util as util
 
@@ -82,7 +81,7 @@ INCLUDED_PATTERNS: Final[list[str]] = [
 # Release policy fields which this check relies on - used for result caching
 INPUT_POLICY_KEYS: Final[list[str]] = ["license_check_mode", 
"source_excludes_lightweight"]
 INPUT_EXTRA_ARGS: Final[list[str]] = ["is_podling"]
-CHECK_VERSION: Final[str] = "2"
+CHECK_VERSION: Final[str] = "3"
 
 # Types
 
@@ -184,6 +183,14 @@ async def headers(args: checks.FunctionArguments) -> 
results.Results | None:
     #     log.info(f"Using cached license headers result for 
{artifact_abs_path} (rel: {args.primary_rel_path})")
     #     return None
 
+    cache_dir = await checks.resolve_cache_dir(args)
+    if cache_dir is None:
+        await recorder.failure(
+            "Extracted archive tree is not available",
+            {"rel_path": args.primary_rel_path},
+        )
+        return None
+
     log.info(f"Checking license headers for {artifact_abs_path} (rel: 
{args.primary_rel_path})")
 
     is_source = await recorder.primary_path_is_source()
@@ -197,7 +204,8 @@ async def headers(args: checks.FunctionArguments) -> 
results.Results | None:
     else:
         excludes_source = "none"
 
-    return await _headers_core(recorder, str(artifact_abs_path), ignore_lines, 
excludes_source)
+    artifact_basename = os.path.basename(str(artifact_abs_path))
+    return await _headers_core(recorder, cache_dir, artifact_basename, 
ignore_lines, excludes_source)
 
 
 def headers_validate(content: bytes, _filename: str) -> tuple[bool, str | 
None]:
@@ -347,8 +355,10 @@ def _get_file_extension(filename: str) -> str | None:
     return ext[1:].lower()
 
 
-def _headers_check_core_logic(artifact_path: str, ignore_lines: list[str], 
excludes_source: str) -> Iterator[Result]:  # noqa: C901
-    """Verify Apache License headers in source files within an archive."""
+def _headers_check_core_logic(  # noqa: C901
+    cache_dir: pathlib.Path, artifact_basename: str, ignore_lines: list[str], 
excludes_source: str
+) -> Iterator[Result]:
+    """Verify Apache License headers in source files within an extracted cache 
tree."""
     # We could modify @Lucas-C/pre-commit-hooks instead for this
     # But hopefully this will be robust enough, at least for testing
     # First find and validate the root directory
@@ -364,49 +374,51 @@ def _headers_check_core_logic(artifact_path: str, 
ignore_lines: list[str], exclu
     #         data=None,
     #     )
 
-    artifact_basename = os.path.basename(artifact_path)
-    # log.info(f"Ignore lines: {ignore_lines}")
-
-    # Check files in the archive
-    try:
-        with tarzip.open_archive(artifact_path) as archive:
-            for member in archive:
-                if member.name and member.name.split("/")[-1].startswith("._"):
-                    # Metadata convention
-                    continue
-
-                ignore_path = "/" + artifact_basename + "/" + 
member.name.lstrip("/")
-                matcher = util.create_path_matcher(ignore_lines, 
pathlib.Path(ignore_path), pathlib.Path("/"))
-                # log.info(f"Checking {ignore_path} with matcher {matcher}")
-                if matcher(ignore_path):
-                    # log.info(f"Skipping {ignore_path} because it matches the 
ignore list")
-                    continue
-
-                match _headers_check_core_logic_process_file(archive, member):
-                    case ArtifactResult() | MemberResult() as result:
-                        artifact_data.files_checked += 1
-                        match result.status:
-                            case sql.CheckResultStatus.SUCCESS:
-                                artifact_data.files_with_valid_headers += 1
-                            case sql.CheckResultStatus.WARNING:
-                                artifact_data.files_with_invalid_headers += 1
-                            case sql.CheckResultStatus.FAILURE:
-                                artifact_data.files_with_invalid_headers += 1
-                            case sql.CheckResultStatus.BLOCKER:
-                                artifact_data.files_with_invalid_headers += 1
-                            case sql.CheckResultStatus.EXCEPTION:
-                                artifact_data.files_with_invalid_headers += 1
-                        yield result
-                    case MemberSkippedResult():
-                        artifact_data.files_skipped += 1
-    except tarzip.ArchiveMemberLimitExceededError as e:
+    if not cache_dir.is_dir():
         yield ArtifactResult(
             status=sql.CheckResultStatus.FAILURE,
-            message=f"Archive has too many members: {e}",
-            data={"error": str(e)},
+            message="Cache directory is not available",
+            data=None,
         )
         return
 
+    # log.info(f"Ignore lines: {ignore_lines}")
+
+    for dirpath, dirnames, filenames in os.walk(cache_dir):
+        dirnames.sort()
+        for filename in sorted(filenames):
+            if filename.startswith("._"):
+                # Metadata convention
+                continue
+
+            file_path = pathlib.Path(dirpath) / filename
+            rel_path = str(file_path.relative_to(cache_dir))
+
+            ignore_path = "/" + artifact_basename + "/" + rel_path
+            matcher = util.create_path_matcher(ignore_lines, 
pathlib.Path(ignore_path), pathlib.Path("/"))
+            # log.info(f"Checking {ignore_path} with matcher {matcher}")
+            if matcher(ignore_path):
+                # log.info(f"Skipping {ignore_path} because it matches the 
ignore list")
+                continue
+
+            match _headers_check_core_logic_process_file(file_path, rel_path):
+                case ArtifactResult() | MemberResult() as result:
+                    artifact_data.files_checked += 1
+                    match result.status:
+                        case sql.CheckResultStatus.SUCCESS:
+                            artifact_data.files_with_valid_headers += 1
+                        case sql.CheckResultStatus.WARNING:
+                            artifact_data.files_with_invalid_headers += 1
+                        case sql.CheckResultStatus.FAILURE:
+                            artifact_data.files_with_invalid_headers += 1
+                        case sql.CheckResultStatus.BLOCKER:
+                            artifact_data.files_with_invalid_headers += 1
+                        case sql.CheckResultStatus.EXCEPTION:
+                            artifact_data.files_with_invalid_headers += 1
+                    yield result
+                case MemberSkippedResult():
+                    artifact_data.files_skipped += 1
+
     yield ArtifactResult(
         status=sql.CheckResultStatus.SUCCESS,
         message=f"Checked {util.plural(artifact_data.files_checked, 'file')},"
@@ -418,56 +430,48 @@ def _headers_check_core_logic(artifact_path: str, 
ignore_lines: list[str], exclu
 
 
 def _headers_check_core_logic_process_file(
-    archive: tarzip.Archive,
-    member: tarzip.Member,
+    file_path: pathlib.Path,
+    rel_path: str,
 ) -> Result:
-    """Process a single file in an archive for license header verification."""
-    if not member.isfile():
+    """Process a single file for license header verification."""
+    if not file_path.is_file():
         return MemberSkippedResult(
-            path=member.name,
+            path=rel_path,
             reason="Not a file",
         )
 
     # Check if we should verify this file, based on extension
-    if not _headers_check_core_logic_should_check(member.name):
+    if not _headers_check_core_logic_should_check(rel_path):
         return MemberSkippedResult(
-            path=member.name,
+            path=rel_path,
             reason="Not a source file",
         )
 
-    # Extract and check the file
+    # Read and check the file
     try:
-        f = archive.extractfile(member)
-        if f is None:
-            return MemberResult(
-                status=sql.CheckResultStatus.EXCEPTION,
-                path=member.name,
-                message="Could not read file",
-                data=None,
-            )
-
         # Allow for some extra content at the start of the file
         # That may be shebangs, encoding declarations, etc.
-        content = f.read(4096)
-        is_valid, error = headers_validate(content, member.name)
+        with open(file_path, "rb") as f:
+            content = f.read(4096)
+        is_valid, error = headers_validate(content, rel_path)
         if is_valid:
             return MemberResult(
                 status=sql.CheckResultStatus.SUCCESS,
-                path=member.name,
+                path=rel_path,
                 message="Valid license header",
                 data=None,
             )
         else:
             return MemberResult(
                 status=sql.CheckResultStatus.FAILURE,
-                path=member.name,
+                path=rel_path,
                 message=f"Invalid license header: {error}",
                 data=None,
             )
     except Exception as e:
         return MemberResult(
             status=sql.CheckResultStatus.EXCEPTION,
-            path=member.name,
+            path=rel_path,
             message=f"Error processing file: {e!s}",
             data=None,
         )
@@ -491,11 +495,15 @@ def _headers_check_core_logic_should_check(filepath: str) 
-> bool:
 
 
 async def _headers_core(
-    recorder: checks.Recorder, artifact_abs_path: str, ignore_lines: 
list[str], excludes_source: str
+    recorder: checks.Recorder,
+    cache_dir: pathlib.Path,
+    artifact_basename: str,
+    ignore_lines: list[str],
+    excludes_source: str,
 ) -> None:
     try:
         for result in await asyncio.to_thread(
-            _headers_check_core_logic, str(artifact_abs_path), ignore_lines, 
excludes_source
+            _headers_check_core_logic, cache_dir, artifact_basename, 
ignore_lines, excludes_source
         ):
             match result:
                 case ArtifactResult():
diff --git a/tests/unit/test_archive_member_limit.py 
b/tests/unit/test_archive_member_limit.py
index 8b2c6bd4..dfa6dea0 100644
--- a/tests/unit/test_archive_member_limit.py
+++ b/tests/unit/test_archive_member_limit.py
@@ -24,7 +24,6 @@ import pytest
 import atr.archives as archives
 import atr.tarzip as tarzip
 import atr.tasks.checks as checks
-import atr.tasks.checks.license as license_checks
 import atr.tasks.checks.targz as targz
 import atr.tasks.checks.zipformat as zipformat
 import tests.unit.recorders as recorders
@@ -49,24 +48,6 @@ def test_extract_wraps_member_limit(tmp_path, monkeypatch):
     assert "too many members" in str(excinfo.value).lower()
 
 
-def test_license_headers_reports_member_limit(tmp_path, monkeypatch):
-    archive_path = tmp_path / "sample.tar"
-    _make_tar(archive_path, ["main.py", "README.txt", "extra.txt"])
-
-    original_open = tarzip.open_archive
-
-    def limited_open(path: str, *args, **kwargs):
-        return original_open(path, max_members=2)
-
-    monkeypatch.setattr(tarzip, "open_archive", limited_open)
-
-    results = list(license_checks._headers_check_core_logic(str(archive_path), 
[], "none"))
-    assert any(
-        isinstance(result, license_checks.ArtifactResult) and ("too many 
members" in result.message.lower())
-        for result in results
-    )
-
-
 def test_open_archive_disables_member_limit_for_negative(tmp_path):
     archive_path = tmp_path / "sample.zip"
     _make_zip(archive_path, ["a.txt", "b.txt", "c.txt"])
diff --git a/tests/unit/test_checks_license.py 
b/tests/unit/test_checks_license.py
index fdc16131..e9057e12 100644
--- a/tests/unit/test_checks_license.py
+++ b/tests/unit/test_checks_license.py
@@ -16,12 +16,14 @@
 # under the License.
 
 import pathlib
+import tarfile
 
 import atr.constants as constants
 import atr.models.sql as sql
 import atr.tasks.checks.license as license
 
 TEST_ARCHIVE = pathlib.Path(__file__).parent.parent / "e2e" / "test_files" / 
"apache-test-0.2.tar.gz"
+TEST_ARCHIVE_BASENAME: str = TEST_ARCHIVE.name
 
 NOTICE_VALID: str = (
     "Apache Test\n"
@@ -91,8 +93,9 @@ def test_files_valid_license_and_notice(tmp_path):
     assert all(r.status == sql.CheckResultStatus.SUCCESS for r in 
artifact_results)
 
 
-def test_headers_check_data_fields_match_model():
-    results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], 
"none"))
+def test_headers_check_data_fields_match_model(tmp_path):
+    cache_dir = _extract_test_archive(tmp_path)
+    results = list(license._headers_check_core_logic(cache_dir, 
TEST_ARCHIVE_BASENAME, [], "none"))
     artifact_results = [r for r in results if isinstance(r, 
license.ArtifactResult)]
     final_result = artifact_results[-1]
     expected_fields = set(license.ArtifactData.model_fields.keys())
@@ -100,9 +103,12 @@ def test_headers_check_data_fields_match_model():
     assert actual_fields == expected_fields
 
 
-def test_headers_check_excludes_matching_files():
-    results_without_excludes = 
list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], "none"))
-    results_with_excludes = 
list(license._headers_check_core_logic(str(TEST_ARCHIVE), ["*.py"], "policy"))
+def test_headers_check_excludes_matching_files(tmp_path):
+    cache_dir = _extract_test_archive(tmp_path)
+    results_without_excludes = 
list(license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, [], 
"none"))
+    results_with_excludes = list(
+        license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, 
["*.py"], "policy")
+    )
 
     def get_files_checked(results: list) -> int:
         for r in results:
@@ -115,16 +121,18 @@ def test_headers_check_excludes_matching_files():
     assert with_excludes < without_excludes
 
 
-def test_headers_check_includes_excludes_source_none():
-    results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], 
"none"))
+def test_headers_check_includes_excludes_source_none(tmp_path):
+    cache_dir = _extract_test_archive(tmp_path)
+    results = list(license._headers_check_core_logic(cache_dir, 
TEST_ARCHIVE_BASENAME, [], "none"))
     artifact_results = [r for r in results if isinstance(r, 
license.ArtifactResult)]
     assert len(artifact_results) > 0
     final_result = artifact_results[-1]
     assert final_result.data["excludes_source"] == "none"
 
 
-def test_headers_check_includes_excludes_source_policy():
-    results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], 
"policy"))
+def test_headers_check_includes_excludes_source_policy(tmp_path):
+    cache_dir = _extract_test_archive(tmp_path)
+    results = list(license._headers_check_core_logic(cache_dir, 
TEST_ARCHIVE_BASENAME, [], "policy"))
     artifact_results = [r for r in results if isinstance(r, 
license.ArtifactResult)]
     final_result = artifact_results[-1]
     assert final_result.data["excludes_source"] == "policy"
@@ -136,3 +144,11 @@ def _cache_with_root(tmp_path: pathlib.Path) -> 
pathlib.Path:
     root = cache_dir / "apache-test-0.2"
     root.mkdir()
     return cache_dir
+
+
+def _extract_test_archive(tmp_path: pathlib.Path) -> pathlib.Path:
+    cache_dir = tmp_path / "headers_cache"
+    cache_dir.mkdir()
+    with tarfile.open(TEST_ARCHIVE) as tf:
+        tf.extractall(cache_dir, filter="data")
+    return cache_dir


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to