This is an automated email from the ASF dual-hosted git repository. sbp pushed a commit to branch sbp in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
commit 3313c3a182caaa37d68d16f22266726be720c241 Author: Sean B. Palmer <[email protected]> AuthorDate: Wed Mar 11 14:46:26 2026 +0000 Use extracted archives in license header checks --- atr/tasks/checks/license.py | 142 +++++++++++++++++--------------- tests/unit/test_archive_member_limit.py | 19 ----- tests/unit/test_checks_license.py | 34 ++++++-- 3 files changed, 100 insertions(+), 95 deletions(-) diff --git a/atr/tasks/checks/license.py b/atr/tasks/checks/license.py index 8720e7de..6d60e43f 100644 --- a/atr/tasks/checks/license.py +++ b/atr/tasks/checks/license.py @@ -29,7 +29,6 @@ import atr.log as log import atr.models.results as results import atr.models.schema as schema import atr.models.sql as sql -import atr.tarzip as tarzip import atr.tasks.checks as checks import atr.util as util @@ -82,7 +81,7 @@ INCLUDED_PATTERNS: Final[list[str]] = [ # Release policy fields which this check relies on - used for result caching INPUT_POLICY_KEYS: Final[list[str]] = ["license_check_mode", "source_excludes_lightweight"] INPUT_EXTRA_ARGS: Final[list[str]] = ["is_podling"] -CHECK_VERSION: Final[str] = "2" +CHECK_VERSION: Final[str] = "3" # Types @@ -184,6 +183,14 @@ async def headers(args: checks.FunctionArguments) -> results.Results | None: # log.info(f"Using cached license headers result for {artifact_abs_path} (rel: {args.primary_rel_path})") # return None + cache_dir = await checks.resolve_cache_dir(args) + if cache_dir is None: + await recorder.failure( + "Extracted archive tree is not available", + {"rel_path": args.primary_rel_path}, + ) + return None + log.info(f"Checking license headers for {artifact_abs_path} (rel: {args.primary_rel_path})") is_source = await recorder.primary_path_is_source() @@ -197,7 +204,8 @@ async def headers(args: checks.FunctionArguments) -> results.Results | None: else: excludes_source = "none" - return await _headers_core(recorder, str(artifact_abs_path), ignore_lines, excludes_source) + artifact_basename = os.path.basename(str(artifact_abs_path)) + return await _headers_core(recorder, cache_dir, artifact_basename, ignore_lines, excludes_source) def headers_validate(content: bytes, _filename: str) -> tuple[bool, str | None]: @@ -347,8 +355,10 @@ def _get_file_extension(filename: str) -> str | None: return ext[1:].lower() -def _headers_check_core_logic(artifact_path: str, ignore_lines: list[str], excludes_source: str) -> Iterator[Result]: # noqa: C901 - """Verify Apache License headers in source files within an archive.""" +def _headers_check_core_logic( # noqa: C901 + cache_dir: pathlib.Path, artifact_basename: str, ignore_lines: list[str], excludes_source: str +) -> Iterator[Result]: + """Verify Apache License headers in source files within an extracted cache tree.""" # We could modify @Lucas-C/pre-commit-hooks instead for this # But hopefully this will be robust enough, at least for testing # First find and validate the root directory @@ -364,49 +374,51 @@ def _headers_check_core_logic(artifact_path: str, ignore_lines: list[str], exclu # data=None, # ) - artifact_basename = os.path.basename(artifact_path) - # log.info(f"Ignore lines: {ignore_lines}") - - # Check files in the archive - try: - with tarzip.open_archive(artifact_path) as archive: - for member in archive: - if member.name and member.name.split("/")[-1].startswith("._"): - # Metadata convention - continue - - ignore_path = "/" + artifact_basename + "/" + member.name.lstrip("/") - matcher = util.create_path_matcher(ignore_lines, pathlib.Path(ignore_path), pathlib.Path("/")) - # log.info(f"Checking {ignore_path} with matcher {matcher}") - if matcher(ignore_path): - # log.info(f"Skipping {ignore_path} because it matches the ignore list") - continue - - match _headers_check_core_logic_process_file(archive, member): - case ArtifactResult() | MemberResult() as result: - artifact_data.files_checked += 1 - match result.status: - case sql.CheckResultStatus.SUCCESS: - artifact_data.files_with_valid_headers += 1 - case sql.CheckResultStatus.WARNING: - artifact_data.files_with_invalid_headers += 1 - case sql.CheckResultStatus.FAILURE: - artifact_data.files_with_invalid_headers += 1 - case sql.CheckResultStatus.BLOCKER: - artifact_data.files_with_invalid_headers += 1 - case sql.CheckResultStatus.EXCEPTION: - artifact_data.files_with_invalid_headers += 1 - yield result - case MemberSkippedResult(): - artifact_data.files_skipped += 1 - except tarzip.ArchiveMemberLimitExceededError as e: + if not cache_dir.is_dir(): yield ArtifactResult( status=sql.CheckResultStatus.FAILURE, - message=f"Archive has too many members: {e}", - data={"error": str(e)}, + message="Cache directory is not available", + data=None, ) return + # log.info(f"Ignore lines: {ignore_lines}") + + for dirpath, dirnames, filenames in os.walk(cache_dir): + dirnames.sort() + for filename in sorted(filenames): + if filename.startswith("._"): + # Metadata convention + continue + + file_path = pathlib.Path(dirpath) / filename + rel_path = str(file_path.relative_to(cache_dir)) + + ignore_path = "/" + artifact_basename + "/" + rel_path + matcher = util.create_path_matcher(ignore_lines, pathlib.Path(ignore_path), pathlib.Path("/")) + # log.info(f"Checking {ignore_path} with matcher {matcher}") + if matcher(ignore_path): + # log.info(f"Skipping {ignore_path} because it matches the ignore list") + continue + + match _headers_check_core_logic_process_file(file_path, rel_path): + case ArtifactResult() | MemberResult() as result: + artifact_data.files_checked += 1 + match result.status: + case sql.CheckResultStatus.SUCCESS: + artifact_data.files_with_valid_headers += 1 + case sql.CheckResultStatus.WARNING: + artifact_data.files_with_invalid_headers += 1 + case sql.CheckResultStatus.FAILURE: + artifact_data.files_with_invalid_headers += 1 + case sql.CheckResultStatus.BLOCKER: + artifact_data.files_with_invalid_headers += 1 + case sql.CheckResultStatus.EXCEPTION: + artifact_data.files_with_invalid_headers += 1 + yield result + case MemberSkippedResult(): + artifact_data.files_skipped += 1 + yield ArtifactResult( status=sql.CheckResultStatus.SUCCESS, message=f"Checked {util.plural(artifact_data.files_checked, 'file')}," @@ -418,56 +430,48 @@ def _headers_check_core_logic(artifact_path: str, ignore_lines: list[str], exclu def _headers_check_core_logic_process_file( - archive: tarzip.Archive, - member: tarzip.Member, + file_path: pathlib.Path, + rel_path: str, ) -> Result: - """Process a single file in an archive for license header verification.""" - if not member.isfile(): + """Process a single file for license header verification.""" + if not file_path.is_file(): return MemberSkippedResult( - path=member.name, + path=rel_path, reason="Not a file", ) # Check if we should verify this file, based on extension - if not _headers_check_core_logic_should_check(member.name): + if not _headers_check_core_logic_should_check(rel_path): return MemberSkippedResult( - path=member.name, + path=rel_path, reason="Not a source file", ) - # Extract and check the file + # Read and check the file try: - f = archive.extractfile(member) - if f is None: - return MemberResult( - status=sql.CheckResultStatus.EXCEPTION, - path=member.name, - message="Could not read file", - data=None, - ) - # Allow for some extra content at the start of the file # That may be shebangs, encoding declarations, etc. - content = f.read(4096) - is_valid, error = headers_validate(content, member.name) + with open(file_path, "rb") as f: + content = f.read(4096) + is_valid, error = headers_validate(content, rel_path) if is_valid: return MemberResult( status=sql.CheckResultStatus.SUCCESS, - path=member.name, + path=rel_path, message="Valid license header", data=None, ) else: return MemberResult( status=sql.CheckResultStatus.FAILURE, - path=member.name, + path=rel_path, message=f"Invalid license header: {error}", data=None, ) except Exception as e: return MemberResult( status=sql.CheckResultStatus.EXCEPTION, - path=member.name, + path=rel_path, message=f"Error processing file: {e!s}", data=None, ) @@ -491,11 +495,15 @@ def _headers_check_core_logic_should_check(filepath: str) -> bool: async def _headers_core( - recorder: checks.Recorder, artifact_abs_path: str, ignore_lines: list[str], excludes_source: str + recorder: checks.Recorder, + cache_dir: pathlib.Path, + artifact_basename: str, + ignore_lines: list[str], + excludes_source: str, ) -> None: try: for result in await asyncio.to_thread( - _headers_check_core_logic, str(artifact_abs_path), ignore_lines, excludes_source + _headers_check_core_logic, cache_dir, artifact_basename, ignore_lines, excludes_source ): match result: case ArtifactResult(): diff --git a/tests/unit/test_archive_member_limit.py b/tests/unit/test_archive_member_limit.py index 8b2c6bd4..dfa6dea0 100644 --- a/tests/unit/test_archive_member_limit.py +++ b/tests/unit/test_archive_member_limit.py @@ -24,7 +24,6 @@ import pytest import atr.archives as archives import atr.tarzip as tarzip import atr.tasks.checks as checks -import atr.tasks.checks.license as license_checks import atr.tasks.checks.targz as targz import atr.tasks.checks.zipformat as zipformat import tests.unit.recorders as recorders @@ -49,24 +48,6 @@ def test_extract_wraps_member_limit(tmp_path, monkeypatch): assert "too many members" in str(excinfo.value).lower() -def test_license_headers_reports_member_limit(tmp_path, monkeypatch): - archive_path = tmp_path / "sample.tar" - _make_tar(archive_path, ["main.py", "README.txt", "extra.txt"]) - - original_open = tarzip.open_archive - - def limited_open(path: str, *args, **kwargs): - return original_open(path, max_members=2) - - monkeypatch.setattr(tarzip, "open_archive", limited_open) - - results = list(license_checks._headers_check_core_logic(str(archive_path), [], "none")) - assert any( - isinstance(result, license_checks.ArtifactResult) and ("too many members" in result.message.lower()) - for result in results - ) - - def test_open_archive_disables_member_limit_for_negative(tmp_path): archive_path = tmp_path / "sample.zip" _make_zip(archive_path, ["a.txt", "b.txt", "c.txt"]) diff --git a/tests/unit/test_checks_license.py b/tests/unit/test_checks_license.py index fdc16131..e9057e12 100644 --- a/tests/unit/test_checks_license.py +++ b/tests/unit/test_checks_license.py @@ -16,12 +16,14 @@ # under the License. import pathlib +import tarfile import atr.constants as constants import atr.models.sql as sql import atr.tasks.checks.license as license TEST_ARCHIVE = pathlib.Path(__file__).parent.parent / "e2e" / "test_files" / "apache-test-0.2.tar.gz" +TEST_ARCHIVE_BASENAME: str = TEST_ARCHIVE.name NOTICE_VALID: str = ( "Apache Test\n" @@ -91,8 +93,9 @@ def test_files_valid_license_and_notice(tmp_path): assert all(r.status == sql.CheckResultStatus.SUCCESS for r in artifact_results) -def test_headers_check_data_fields_match_model(): - results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], "none")) +def test_headers_check_data_fields_match_model(tmp_path): + cache_dir = _extract_test_archive(tmp_path) + results = list(license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, [], "none")) artifact_results = [r for r in results if isinstance(r, license.ArtifactResult)] final_result = artifact_results[-1] expected_fields = set(license.ArtifactData.model_fields.keys()) @@ -100,9 +103,12 @@ def test_headers_check_data_fields_match_model(): assert actual_fields == expected_fields -def test_headers_check_excludes_matching_files(): - results_without_excludes = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], "none")) - results_with_excludes = list(license._headers_check_core_logic(str(TEST_ARCHIVE), ["*.py"], "policy")) +def test_headers_check_excludes_matching_files(tmp_path): + cache_dir = _extract_test_archive(tmp_path) + results_without_excludes = list(license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, [], "none")) + results_with_excludes = list( + license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, ["*.py"], "policy") + ) def get_files_checked(results: list) -> int: for r in results: @@ -115,16 +121,18 @@ def test_headers_check_excludes_matching_files(): assert with_excludes < without_excludes -def test_headers_check_includes_excludes_source_none(): - results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], "none")) +def test_headers_check_includes_excludes_source_none(tmp_path): + cache_dir = _extract_test_archive(tmp_path) + results = list(license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, [], "none")) artifact_results = [r for r in results if isinstance(r, license.ArtifactResult)] assert len(artifact_results) > 0 final_result = artifact_results[-1] assert final_result.data["excludes_source"] == "none" -def test_headers_check_includes_excludes_source_policy(): - results = list(license._headers_check_core_logic(str(TEST_ARCHIVE), [], "policy")) +def test_headers_check_includes_excludes_source_policy(tmp_path): + cache_dir = _extract_test_archive(tmp_path) + results = list(license._headers_check_core_logic(cache_dir, TEST_ARCHIVE_BASENAME, [], "policy")) artifact_results = [r for r in results if isinstance(r, license.ArtifactResult)] final_result = artifact_results[-1] assert final_result.data["excludes_source"] == "policy" @@ -136,3 +144,11 @@ def _cache_with_root(tmp_path: pathlib.Path) -> pathlib.Path: root = cache_dir / "apache-test-0.2" root.mkdir() return cache_dir + + +def _extract_test_archive(tmp_path: pathlib.Path) -> pathlib.Path: + cache_dir = tmp_path / "headers_cache" + cache_dir.mkdir() + with tarfile.open(TEST_ARCHIVE) as tf: + tf.extractall(cache_dir, filter="data") + return cache_dir --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
